@pashu123
Created November 12, 2024 06:41
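
IR-after-all dump from the IREE LLVMGPU codegen pipeline for a single 1024x1024xf32 matmul dispatch (@dot_dispatch_0), lowered with the LLVMGPUMatmulTensorCore strategy. Each "// -----// IR Dump After <Pass> //----- //" banner below shows the dispatch function after the named pass has run.

The invocation is not recorded in the gist; a trace of this shape is normally produced with MLIR's IR-printing flag, for example (command line and file names assumed, not taken from the gist):

    iree-compile dot.mlir --iree-hal-target-backends=cuda \
        --mlir-print-ir-after-all -o /dev/null 2> dump.txt

with an input along the lines of (hypothetical, reconstructed from the dispatch body below):

    func.func @dot(%lhs: tensor<1024x1024xf32>, %rhs: tensor<1024x1024xf32>) -> tensor<1024x1024xf32> {
      %cst = arith.constant 0.000000e+00 : f32
      %empty = tensor.empty() : tensor<1024x1024xf32>
      %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
      %result = linalg.matmul ins(%lhs, %rhs : tensor<1024x1024xf32>, tensor<1024x1024xf32>)
                              outs(%fill : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
      return %result : tensor<1024x1024xf32>
    }
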
// -----// IR Dump After GPUGeneralizeNamedOpsPass (iree-codegen-gpu-generalize-named-ops) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
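// NOTE: apart from the unused index constants (%c0, %c1024, %c1) disappearing in the
// BubbleUpOrdinalOps dump, the dispatch IR is unchanged through TypePropagation,
// BufferizeCopyOnlyDispatches, DecomposeSoftmax, MaterializeEncodingIntoNop,
// BlockDynamicDimensions, Canonicalizer, and CSE below: the shapes are static and the
// body is a plain f32 matmul, so these passes have nothing to do here, and
// MaterializeUserConfigs only wraps the function in a module. The first substantive
// change appears at LLVMGPUSelectLoweringStrategyPass.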
// -----// IR Dump After TypePropagationPass (iree-codegen-type-propagation) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After MaterializeEncodingIntoNopPass (iree-codegen-materialize-encoding-into-nop) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After BlockDynamicDimensionsPass (iree-codegen-block-dynamic-dimensions) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- //
module {
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
}
// -----// IR Dump After LLVMGPUSelectLoweringStrategyPass (iree-llvmgpu-select-lowering-strategy) //----- //
module {
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
}
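// NOTE: this is where the lowering strategy is attached. translation_info selects the
// LLVMGPUMatmulTensorCore pipeline with a [64, 2, 1] workgroup (64 * 2 * 1 = 128 threads,
// i.e. 4 subgroups of 32) and the pipeline options {pipeline_depth = 4, store_stage = 1}.
// The lowering_config on linalg.fill/linalg.matmul requests workgroup tile sizes
// [32, 32, 16]: a 32x32 output tile per workgroup, with the reduction (K) dimension
// stepped by 16 in the later K-loop tiling.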
// -----// IR Dump After LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- //
module {
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
}
// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) shared_outs(%arg2 = %5) -> (tensor<1024x1024xf32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<1024x1024xf32> to tensor<32x1024xf32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 32] [1, 1] : tensor<1024x1024xf32> to tensor<1024x32xf32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 32] [1, 1] : tensor<1024x1024xf32> to tensor<32x32xf32>
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x32xf32>) -> tensor<32x32xf32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<32x1024xf32>, tensor<1024x32xf32>) outs(%7 : tensor<32x32xf32>) -> tensor<32x32xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<1024x1024xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
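// NOTE: TileAndDistributeToWorkgroupsUsingForallOp applies the [32, 32] workgroup tile
// and wraps fill + matmul in an scf.forall mapped to workgroups (y, x). Each iteration
// computes one 32x32 slice of the result from a 32x1024 row block of the LHS and a
// 1024x32 column block of the RHS. Grid size implied by the loop bounds:
//   1024 / 32 = 32 tiles per dimension  ->  32 * 32 = 1024 workgroups.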
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) shared_outs(%arg2 = %5) -> (tensor<1024x1024xf32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<1024x1024xf32> to tensor<32x1024xf32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 32] [1, 1] : tensor<1024x1024xf32> to tensor<1024x32xf32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 32] [1, 1] : tensor<1024x1024xf32> to tensor<32x32xf32>
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x32xf32>) -> tensor<32x32xf32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<32x1024xf32>, tensor<1024x32xf32>) outs(%7 : tensor<32x32xf32>) -> tensor<32x32xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<1024x1024xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) shared_outs(%arg2 = %5) -> (tensor<1024x1024xf32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<1024x1024xf32> to tensor<32x1024xf32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 32] [1, 1] : tensor<1024x1024xf32> to tensor<1024x32xf32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 32] [1, 1] : tensor<1024x1024xf32> to tensor<32x32xf32>
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x32xf32>) -> tensor<32x32xf32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<32x1024xf32>, tensor<1024x32xf32>) outs(%7 : tensor<32x32xf32>) -> tensor<32x32xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<1024x1024xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) shared_outs(%arg2 = %5) -> (tensor<1024x1024xf32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<1024x1024xf32> to tensor<32x1024xf32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 32] [1, 1] : tensor<1024x1024xf32> to tensor<1024x32xf32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 32] [1, 1] : tensor<1024x1024xf32> to tensor<32x32xf32>
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x32xf32>) -> tensor<32x32xf32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<32x1024xf32>, tensor<1024x32xf32>) outs(%7 : tensor<32x32xf32>) -> tensor<32x32xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<1024x1024xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
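// NOTE: EliminateEmptyTensors replaces the tensor.empty() destination with a
// flow.dispatch.tensor.load of the writeonly output binding (%2), so the upcoming
// bufferization can write the result in place instead of allocating a separate
// 1024x1024 buffer.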
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) shared_outs(%arg2 = %5) -> (tensor<1024x1024xf32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<1024x1024xf32> to tensor<32x1024xf32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 32] [1, 1] : tensor<1024x1024xf32> to tensor<1024x32xf32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 32] [1, 1] : tensor<1024x1024xf32> to tensor<32x32xf32>
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x32xf32>) -> tensor<32x32xf32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<32x1024xf32>, tensor<1024x32xf32>) outs(%7 : tensor<32x32xf32>) -> tensor<32x32xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<1024x1024xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview, %subview_0 : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
%subview_2 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_1, %subview_2 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.copy %2, %2 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
return
}
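// NOTE: IREEComprehensiveBufferize lowers the tensor IR onto the HAL bindings: the
// flow.dispatch.tensor.load/store ops disappear, the subspans become memrefs,
// tensor.extract_slice becomes memref.subview, and fill/matmul now write directly into
// a subview of the output buffer. The leftover self-copies (memref.copy %subview_1 to
// %subview_2 over the same region, and %2 to %2) are artifacts of the in-place
// bufferization and are folded away by the canonicalize/CSE runs that follow.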
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview, %subview_0 : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
%subview_2 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_1, %subview_2 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.copy %2, %2 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview, %subview_0 : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
%subview_2 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_1, %subview_2 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview, %subview_0 : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.copy %subview_1, %subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview, %subview_0 : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
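// NOTE: after canonicalize + CSE the redundant copies are gone; each workgroup is left
// with a fill of its 32x32 output subview followed by a 32x1024 * 1024x32 matmul
// accumulating into that subview. The remaining TensorCore pipeline stages (beyond the
// end of this dump) would typically tile the K loop by 16, promote operands to
// workgroup shared memory, and vectorize to tensor-core mma operations, per the
// translation_info above.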
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview, %subview_0 : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview, %subview_0 : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview, %subview_0 : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
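// The three dumps above (CleanupBufferAllocViewPass, Canonicalizer, CSE) show the same IR: the dispatch is
// bufferized, each 32x32 output tile is distributed to a workgroup via scf.forall with workgroup_mapping,
// and the LLVMGPUMatmulTensorCore pipeline (workgroup_size = [64, 2, 1], subgroup_size = 32,
// pipeline_depth = 4) with tile_sizes [[32, 32, 16]] is in effect. The next pass tiles within the workgroup.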
// -----// IR Dump After LLVMGPUTileAndDistributePass (iree-llvmgpu-tile-and-distribute) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c32 = arith.constant 32 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<16x32xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<32x16xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
scf.for %arg2 = %3 to %c32 step %c32 {
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
scf.for %arg3 = %4 to %c32 step %c32 {
%subview_4 = memref.subview %alloc_1[%arg2, %arg3] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.fill {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
}
}
scf.for %arg2 = %c0 to %c1024 step %c16 {
%subview_4 = memref.subview %subview[0, %arg2] [32, 16] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %subview_2[%arg2, 0] [16, 32] [1, 1] : memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_4, %alloc_0 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, #gpu.address_space<workgroup>>
memref.copy %subview_5, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, #gpu.address_space<workgroup>>
gpu.barrier
%thread_id_x_6 = gpu.thread_id x
%thread_id_y_7 = gpu.thread_id y
%4 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y_7]
scf.for %arg3 = %4 to %c32 step %c32 {
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x_6)
scf.for %arg4 = %5 to %c32 step %c32 {
%subview_8 = memref.subview %alloc_0[%arg3, 0] [16, 16] [1, 1] : memref<32x16xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_9 = memref.subview %alloc[0, %arg4] [16, 16] [1, 1] : memref<16x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_10 = memref.subview %alloc_1[%arg3, %arg4] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview_8, %subview_9 : memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_10 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
}
}
}
gpu.barrier
memref.copy %alloc_1, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
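// LLVMGPUTileAndDistributePass distributes the 32x32 workgroup tile across the 2x2 warps of the
// [64, 2, 1] workgroup (thread_id_y and thread_id_x floordiv 32 each pick a 16-row/column offset),
// allocates workgroup-memory staging buffers (32x16 for the LHS tile, 16x32 for the RHS tile) plus a
// 32x32 accumulator, tiles K by 16 in an scf.for, and stages each K-slice through shared memory with
// memref.copy ops guarded by gpu.barrier before each warp runs its 16x16 linalg.matmul.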
// -----// IR Dump After RemoveSingleIterationLoopPass (iree-codegen-remove-single-iteration-loop) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<16x32xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<32x16xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%subview_4 = memref.subview %alloc_1[%3, %4] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.fill {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
scf.for %arg2 = %c0 to %c1024 step %c16 {
%subview_5 = memref.subview %subview[0, %arg2] [32, 16] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %subview_2[%arg2, 0] [16, 32] [1, 1] : memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_5, %alloc_0 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, #gpu.address_space<workgroup>>
memref.copy %subview_6, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, #gpu.address_space<workgroup>>
gpu.barrier
%thread_id_x_7 = gpu.thread_id x
%thread_id_y_8 = gpu.thread_id y
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y_8]
%6 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x_7)
%subview_9 = memref.subview %alloc_0[%5, 0] [16, 16] [1, 1] : memref<32x16xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_10 = memref.subview %alloc[0, %6] [16, 16] [1, 1] : memref<16x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_11 = memref.subview %alloc_1[%5, %6] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview_9, %subview_10 : memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_11 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
}
gpu.barrier
memref.copy %alloc_1, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
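// RemoveSingleIterationLoopPass drops the per-warp scf.for loops around the fill and the inner matmul:
// with this thread layout each of those loops runs at most once (lower bound < 32, step 32), so only the
// K loop over 1024 with step 16 remains.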
// -----// IR Dump After GPUMultiBufferingPass (iree-codegen-gpu-multi-buffering) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%subview_4 = memref.subview %alloc_1[%3, %4] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.fill {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
scf.for %arg2 = %c0 to %c1024 step %c16 {
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_5 = memref.subview %alloc_0[%5, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%6 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_6 = memref.subview %alloc[%6, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview[0, %arg2] [32, 16] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_2[%arg2, 0] [16, 32] [1, 1] : memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_7, %subview_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_6 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%thread_id_x_9 = gpu.thread_id x
%thread_id_y_10 = gpu.thread_id y
%7 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y_10]
%8 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x_9)
%subview_11 = memref.subview %subview_5[%7, 0] [16, 16] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_12 = memref.subview %subview_6[0, %8] [16, 16] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_13 = memref.subview %alloc_1[%7, %8] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview_11, %subview_12 : memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_13 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
}
gpu.barrier
memref.copy %alloc_1, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
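// GPUMultiBufferingPass expands the two staging buffers to four copies each (memref<4x32x16xf32> and
// memref<4x16x32xf32>), presumably to match pipeline_depth = 4 in the translation_info; each K iteration
// selects its slice with (arg2 floordiv 16) mod 4. The 32x32 accumulator stays single-buffered.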
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%subview_4 = memref.subview %alloc_1[%3, %4] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.fill {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
scf.for %arg2 = %c0 to %c1024 step %c16 {
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_5 = memref.subview %alloc_0[%5, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%6 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_6 = memref.subview %alloc[%6, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview[0, %arg2] [32, 16] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_2[%arg2, 0] [16, 32] [1, 1] : memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_7, %subview_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_6 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%thread_id_x_9 = gpu.thread_id x
%thread_id_y_10 = gpu.thread_id y
%7 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y_10]
%8 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x_9)
%subview_11 = memref.subview %subview_5[%7, 0] [16, 16] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_12 = memref.subview %subview_6[0, %8] [16, 16] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_13 = memref.subview %alloc_1[%7, %8] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview_11, %subview_12 : memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_13 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
}
gpu.barrier
memref.copy %alloc_1, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%subview_4 = memref.subview %alloc_1[%3, %4] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.fill {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
scf.for %arg2 = %c0 to %c1024 step %c16 {
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_5 = memref.subview %alloc_0[%5, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_6 = memref.subview %alloc[%5, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview[0, %arg2] [32, 16] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_2[%arg2, 0] [16, 32] [1, 1] : memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_7, %subview_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_6 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%subview_9 = memref.subview %subview_5[%3, 0] [16, 16] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_10 = memref.subview %subview_6[0, %4] [16, 16] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview_9, %subview_10 : memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
}
gpu.barrier
memref.copy %alloc_1, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
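// The Canonicalizer left the multi-buffered form unchanged; CSE then removed the duplicated
// affine.apply buffer index, the repeated gpu.thread_id/affine.apply pairs inside the K loop, and the
// redundant accumulator subview, so the inner linalg.matmul now accumulates directly into %subview_4.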
// -----// IR Dump After RemoveSingleIterationLoopPass (iree-codegen-remove-single-iteration-loop) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%subview_4 = memref.subview %alloc_1[%3, %4] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.fill {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
scf.for %arg2 = %c0 to %c1024 step %c16 {
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_5 = memref.subview %alloc_0[%5, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_6 = memref.subview %alloc[%5, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview[0, %arg2] [32, 16] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_2[%arg2, 0] [16, 32] [1, 1] : memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_7, %subview_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_6 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%subview_9 = memref.subview %subview_5[%3, 0] [16, 16] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_10 = memref.subview %subview_6[0, %4] [16, 16] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview_9, %subview_10 : memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
}
gpu.barrier
memref.copy %alloc_1, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After ReorderWorkgroupsPass (iree-codegen-reorder-workgroups) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%subview_4 = memref.subview %alloc_1[%3, %4] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.fill {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
scf.for %arg2 = %c0 to %c1024 step %c16 {
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_5 = memref.subview %alloc_0[%5, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_6 = memref.subview %alloc[%5, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview[0, %arg2] [32, 16] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_2[%arg2, 0] [16, 32] [1, 1] : memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_7, %subview_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_6 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%subview_9 = memref.subview %subview_5[%3, 0] [16, 16] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_10 = memref.subview %subview_6[0, %4] [16, 16] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview_9, %subview_10 : memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
}
gpu.barrier
memref.copy %alloc_1, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%subview_4 = memref.subview %alloc_1[%3, %4] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.fill {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
scf.for %arg2 = %c0 to %c1024 step %c16 {
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_5 = memref.subview %alloc_0[%5, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_6 = memref.subview %alloc[%5, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview[0, %arg2] [32, 16] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_2[%arg2, 0] [16, 32] [1, 1] : memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_7, %subview_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_6 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%subview_9 = memref.subview %subview_5[%3, 0] [16, 16] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_10 = memref.subview %subview_6[0, %4] [16, 16] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview_9, %subview_10 : memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
}
gpu.barrier
memref.copy %alloc_1, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%subview_4 = memref.subview %alloc_1[%3, %4] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.fill {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
scf.for %arg2 = %c0 to %c1024 step %c16 {
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_5 = memref.subview %alloc_0[%5, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_6 = memref.subview %alloc[%5, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview[0, %arg2] [32, 16] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_2[%arg2, 0] [16, 32] [1, 1] : memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_7, %subview_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_6 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%subview_9 = memref.subview %subview_5[%3, 0] [16, 16] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_10 = memref.subview %subview_6[0, %4] [16, 16] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview_9, %subview_10 : memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
}
gpu.barrier
memref.copy %alloc_1, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
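The dump above already fixes the schedule that the later passes only lower: 32x32 workgroup tiles of the 1024x1024 matmul, a k loop of step 16, 4-deep shared-memory staging buffers indexed by (k floordiv 16) mod 4 (the pipeline_depth = 4 from translation_info), and one 16x16 accumulator tile per warp. The NumPy sketch below is not IREE code or the generated kernel; it only replays that index math with illustrative names to make the tiling concrete.

import numpy as np

M = N = K = 1024
BM = BN = 32          # workgroup tile
BK = 16               # k step
STAGES = 4            # pipeline_depth in translation_info

A = np.random.rand(M, K).astype(np.float32)
B = np.random.rand(K, N).astype(np.float32)
C = np.zeros((M, N), dtype=np.float32)

for i0 in range(0, M, BM):          # scf.forall over workgroups (y)
    for j0 in range(0, N, BN):      #                            (x)
        lhs_stage = np.zeros((STAGES, BM, BK), np.float32)  # plays the role of %alloc_0
        rhs_stage = np.zeros((STAGES, BK, BN), np.float32)  # plays the role of %alloc
        acc = np.zeros((BM, BN), np.float32)                # plays the role of %alloc_1
        for k in range(0, K, BK):
            s = (k // BK) % STAGES   # affine_map (d0 floordiv 16) mod 4
            lhs_stage[s] = A[i0:i0 + BM, k:k + BK]   # copy_to_workgroup_memory
            rhs_stage[s] = B[k:k + BK, j0:j0 + BN]
            acc += lhs_stage[s] @ rhs_stage[s]       # the four 16x16 warp tiles, merged
        C[i0:i0 + BM, j0:j0 + BN] = acc

assert np.allclose(C, A @ B, rtol=1e-3)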
// -----// IR Dump After LLVMGPUTensorCoreVectorizationPass (iree-llvmgpu-tensorcore-vectorization) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%cst = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_4 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%subview_5 = memref.subview %alloc_2[%3, %4] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
vector.transfer_write %cst, %subview_5[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf32>, memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.for %arg2 = %c0 to %c1024 step %c16 {
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_6 = memref.subview %alloc_1[%5, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %alloc[%5, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_8 = memref.subview %subview[0, %arg2] [32, 16] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_9 = memref.subview %subview_3[%arg2, 0] [16, 32] [1, 1] : memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_8, %subview_6 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_9, %subview_7 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%subview_10 = memref.subview %subview_6[%3, 0] [16, 16] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_11 = memref.subview %subview_7[0, %4] [16, 16] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%6 = vector.transfer_read %subview_10[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<16x8xf32>
%7 = vector.transfer_read %subview_10[%c0, %c8], %cst_0 {in_bounds = [true, true]} : memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<16x8xf32>
%8 = vector.transfer_read %subview_11[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<8x16xf32>
%9 = vector.transfer_read %subview_11[%c8, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<8x16xf32>
%10 = vector.transfer_read %subview_5[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<16x16xf32>
%11 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %8, %10 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%12 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %7, %9, %11 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
vector.transfer_write %12, %subview_5[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf32>, memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
memref.copy %alloc_2, %subview_4 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
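LLVMGPUTensorCoreVectorizationPass rewrites the fill and matmul on each warp's 16x16 tile into vector transfer_read/transfer_write plus two chained vector.contract ops: the k = 16 step is split into two k = 8 halves that feed the same accumulator. A small NumPy sketch of that arithmetic (names are illustrative, not IREE API):

import numpy as np

lhs = np.random.rand(16, 16).astype(np.float32)   # %subview_10, read as %6 and %7
rhs = np.random.rand(16, 16).astype(np.float32)   # %subview_11, read as %8 and %9
acc = np.random.rand(16, 16).astype(np.float32)   # %10, the current accumulator tile

step1 = lhs[:, :8] @ rhs[:8, :] + acc             # first vector.contract (%11)
step2 = lhs[:, 8:] @ rhs[8:, :] + step1           # second vector.contract (%12)

assert np.allclose(step2, lhs @ rhs + acc, rtol=1e-5)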
// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%cst = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
vector.transfer_write %cst, %alloc_2[%3, %4] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x32xf32, #gpu.address_space<workgroup>>
scf.for %arg2 = %c0 to %c1024 step %c16 {
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_3 = memref.subview %alloc_1[%5, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %alloc[%5, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %0[%arg0, %arg2] [32, 16] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %1[%arg2, %arg1] [16, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_5, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_4 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%6 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%7 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%8 = vector.transfer_read %alloc_1[%6, %7, %c0], %cst_0 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%9 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = vector.transfer_read %alloc_1[%9, %10, %c8], %cst_0 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%12 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%13 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%14 = vector.transfer_read %alloc[%12, %c0, %13], %cst_0 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%15 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%16 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%17 = vector.transfer_read %alloc[%15, %c8, %16], %cst_0 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%18 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%20 = vector.transfer_read %alloc_2[%18, %19], %cst_0 {in_bounds = [true, true]} : memref<32x32xf32, #gpu.address_space<workgroup>>, vector<16x16xf32>
%21 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %8, %14, %20 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%22 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %17, %21 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%23 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%24 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
vector.transfer_write %22, %alloc_2[%23, %24] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x32xf32, #gpu.address_space<workgroup>>
}
gpu.barrier
memref.copy %alloc_2, %subview {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
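FoldMemRefAliasOps composes the subview offsets into the vector transfer indices, so the reads now address the 4x32x16 and 4x16x32 staging allocations directly at [buffer, row, col] instead of going through a chain of subviews. A NumPy sketch of that index composition (illustrative names only):

import numpy as np

alloc_1 = np.random.rand(4, 32, 16).astype(np.float32)  # the 4-deep LHS staging buffer
buf, row = 2, 16                                         # some value of %5 and %3

# Before folding: subview [buf, 0, 0], then subview [row, 0], then a 16x8 read at [0, 0].
via_subviews = alloc_1[buf][row:row + 16, 0:8]

# After folding: one read of the base allocation at the composed indices.
folded = alloc_1[buf, row:row + 16, 0:8]

assert np.array_equal(via_subviews, folded)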
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%cst = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
vector.transfer_write %cst, %alloc_2[%3, %4] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x32xf32, #gpu.address_space<workgroup>>
scf.for %arg2 = %c0 to %c1024 step %c16 {
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_3 = memref.subview %alloc_1[%5, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %alloc[%5, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %0[%arg0, %arg2] [32, 16] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %1[%arg2, %arg1] [16, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_5, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_4 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%6 = vector.transfer_read %alloc_1[%5, %3, %c0], %cst_0 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%7 = vector.transfer_read %alloc_1[%5, %3, %c8], %cst_0 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%8 = vector.transfer_read %alloc[%5, %c0, %4], %cst_0 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%9 = vector.transfer_read %alloc[%5, %c8, %4], %cst_0 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%10 = vector.transfer_read %alloc_2[%3, %4], %cst_0 {in_bounds = [true, true]} : memref<32x32xf32, #gpu.address_space<workgroup>>, vector<16x16xf32>
%11 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %8, %10 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%12 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %7, %9, %11 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
vector.transfer_write %12, %alloc_2[%3, %4] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x32xf32, #gpu.address_space<workgroup>>
}
gpu.barrier
memref.copy %alloc_2, %subview {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After OptimizeVectorTransferPass (iree-codegen-optimize-vector-transfer) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%cst = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %cst) -> (vector<16x16xf32>) {
%6 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_3 = memref.subview %alloc_1[%6, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %alloc[%6, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %0[%arg0, %arg2] [32, 16] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %1[%arg2, %arg1] [16, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_5, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_4 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%7 = vector.transfer_read %alloc_1[%6, %3, %c0], %cst_0 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%8 = vector.transfer_read %alloc_1[%6, %3, %c8], %cst_0 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%9 = vector.transfer_read %alloc[%6, %c0, %4], %cst_0 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%10 = vector.transfer_read %alloc[%6, %c8, %4], %cst_0 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%11 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %7, %9, %arg3 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%12 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %8, %10, %11 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
scf.yield %12 : vector<16x16xf32>
}
vector.transfer_write %5, %alloc_2[%3, %4] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x32xf32, #gpu.address_space<workgroup>>
gpu.barrier
memref.copy %alloc_2, %subview {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
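OptimizeVectorTransferPass hoists the accumulator out of shared memory: the per-iteration transfer_read/transfer_write of %alloc_2 becomes a loop-carried vector (the iter_args of the scf.for), with a single transfer_write after the loop. A before/after sketch of the data flow in NumPy (illustrative, not generated code):

import numpy as np

steps = 1024 // 16
lhs = np.random.rand(steps, 16, 16).astype(np.float32)
rhs = np.random.rand(steps, 16, 16).astype(np.float32)

# Before: the accumulator tile lives in shared memory and is re-read and
# re-written on every k iteration.
shared = np.zeros((16, 16), np.float32)           # the %alloc_2 slice
for s in range(steps):
    shared = shared + lhs[s] @ rhs[s]             # read, contract, write back

# After: a register accumulator carried through iter_args, stored once.
acc = np.zeros((16, 16), np.float32)              # %cst, the initial iter_arg
for s in range(steps):
    acc = acc + lhs[s] @ rhs[s]
shared_after = acc                                # the single transfer_write

assert np.allclose(shared_after, shared)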
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%cst = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %cst) -> (vector<16x16xf32>) {
%6 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_3 = memref.subview %alloc_1[%6, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %alloc[%6, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %0[%arg0, %arg2] [32, 16] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %1[%arg2, %arg1] [16, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_5, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_4 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%7 = vector.transfer_read %alloc_1[%6, %3, %c0], %cst_0 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%8 = vector.transfer_read %alloc_1[%6, %3, %c8], %cst_0 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%9 = vector.transfer_read %alloc[%6, %c0, %4], %cst_0 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%10 = vector.transfer_read %alloc[%6, %c8, %4], %cst_0 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%11 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %7, %9, %arg3 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%12 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %8, %10, %11 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
scf.yield %12 : vector<16x16xf32>
}
vector.transfer_write %5, %alloc_2[%3, %4] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x32xf32, #gpu.address_space<workgroup>>
gpu.barrier
memref.copy %alloc_2, %subview {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After MemrefCopyToLinalgPass (iree-codegen-memrefcopy-to-linalg) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%cst = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %cst) -> (vector<16x16xf32>) {
%6 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_3 = memref.subview %alloc_1[%6, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %alloc[%6, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %0[%arg0, %arg2] [32, 16] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %1[%arg2, %arg1] [16, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_5 : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_3 : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) attrs = {__internal_linalg_transform__ = "copy_to_workgroup_memory"} {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_6 : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_4 : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>) attrs = {__internal_linalg_transform__ = "copy_to_workgroup_memory"} {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
}
gpu.barrier
%7 = vector.transfer_read %alloc_1[%6, %3, %c0], %cst_0 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%8 = vector.transfer_read %alloc_1[%6, %3, %c8], %cst_0 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%9 = vector.transfer_read %alloc[%6, %c0, %4], %cst_0 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%10 = vector.transfer_read %alloc[%6, %c8, %4], %cst_0 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%11 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %7, %9, %arg3 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%12 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %8, %10, %11 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
scf.yield %12 : vector<16x16xf32>
}
vector.transfer_write %5, %alloc_2[%3, %4] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x32xf32, #gpu.address_space<workgroup>>
gpu.barrier
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloc_2 : memref<32x32xf32, #gpu.address_space<workgroup>>) outs(%subview : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {__internal_linalg_transform__ = "copy_to_workgroup_memory"} {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
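MemrefCopyToLinalgPass only changes the representation of the staging copies: each memref.copy becomes a rank-2 parallel linalg.generic whose body yields the input element, which is what the next pass distributes across threads. In NumPy terms (illustrative):

import numpy as np

src = np.random.rand(32, 16).astype(np.float32)
dst = np.empty_like(src)
for i in range(32):              # iterator_types = ["parallel", "parallel"]
    for j in range(16):
        dst[i, j] = src[i, j]    # the linalg.yield %in
assert np.array_equal(dst, src)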
// -----// IR Dump After GPUDistributeSharedMemoryCopyPass (iree-codegen-gpu-distribute-shared-memory-copy) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%0 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 64 + s2 * 128)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%c8 = arith.constant 8 : index
%cst = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%cst_2 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%thread_id_x_3 = gpu.thread_id x
%thread_id_y_4 = gpu.thread_id y
%4 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y_4]
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x_3)
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %3[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %cst) -> (vector<16x16xf32>) {
%22 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_32 = memref.subview %alloc_0[%22, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_33 = memref.subview %alloc_1[%22, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_34 = memref.subview %1[%arg0, %arg2] [32, 16] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_35 = memref.subview %2[%arg2, %arg1] [16, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
%c32_36 = arith.constant 32 : index
%c16_37 = arith.constant 16 : index
%c0_38 = arith.constant 0 : index
%c32_39 = arith.constant 32 : index
%c32_40 = arith.constant 32 : index
%c0_41 = arith.constant 0 : index
%c16_42 = arith.constant 16 : index
%c16_43 = arith.constant 16 : index
%subview_44 = memref.subview %subview_34[%c0_38, %c0_41] [32, 16] [1, 1] : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_45 = memref.subview %subview_32[%c0_38, %c0_41] [32, 16] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%c1_46 = arith.constant 1 : index
%c4_47 = arith.constant 4 : index
%23 = affine.apply affine_map<()[s0] -> (s0 mod 4)>()[%thread_id_x]
%c4_48 = arith.constant 4 : index
%24 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%c32_49 = arith.constant 32 : index
%25 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 * 16 + s2 * 32 + s0 floordiv 4) floordiv 32)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%c0_50 = arith.constant 0 : index
%c32_51 = arith.constant 32 : index
%c1_52 = arith.constant 1 : index
%c0_53 = arith.constant 0 : index
%c16_54 = arith.constant 16 : index
%c4_55 = arith.constant 4 : index
%26 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%27 = affine.apply affine_map<() -> (32)>()
%28 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%29 = affine.apply affine_map<() -> (16)>()
%30 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%31 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%32 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%33 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%subview_56 = memref.subview %subview_44[%30, %31] [1, 4] [1, 1] : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_57 = memref.subview %subview_45[%32, %33] [1, 4] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%c1_58 = arith.constant 1 : index
%c4_59 = arith.constant 4 : index
%c0_60 = arith.constant 0 : index
%cst_61 = arith.constant 0.000000e+00 : f32
%34 = vector.transfer_read %subview_56[%c0_60, %c0_60], %cst_61 : memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
%cst_62 = arith.constant 0.000000e+00 : f32
%35 = vector.transfer_read %subview_57[%c0_60, %c0_60], %cst_62 : memref<1x4xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x4xf32>
%c0_63 = arith.constant 0 : index
vector.transfer_write %34, %subview_57[%c0_63, %c0_63] : vector<1x4xf32>, memref<1x4xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%c16_64 = arith.constant 16 : index
%c32_65 = arith.constant 32 : index
%c0_66 = arith.constant 0 : index
%c16_67 = arith.constant 16 : index
%c16_68 = arith.constant 16 : index
%c0_69 = arith.constant 0 : index
%c32_70 = arith.constant 32 : index
%c32_71 = arith.constant 32 : index
%subview_72 = memref.subview %subview_35[%c0_66, %c0_69] [16, 32] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_73 = memref.subview %subview_33[%c0_66, %c0_69] [16, 32] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%c1_74 = arith.constant 1 : index
%c4_75 = arith.constant 4 : index
%36 = affine.apply affine_map<()[s0] -> (s0 mod 8)>()[%thread_id_x]
%c8_76 = arith.constant 8 : index
%37 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%c16_77 = arith.constant 16 : index
%38 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 * 8 + s2 * 16 + s0 floordiv 8) floordiv 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%c0_78 = arith.constant 0 : index
%c16_79 = arith.constant 16 : index
%c1_80 = arith.constant 1 : index
%c0_81 = arith.constant 0 : index
%c32_82 = arith.constant 32 : index
%c4_83 = arith.constant 4 : index
%39 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%40 = affine.apply affine_map<() -> (16)>()
%41 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%42 = affine.apply affine_map<() -> (32)>()
%43 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%44 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%45 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%46 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%subview_84 = memref.subview %subview_72[%43, %44] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_85 = memref.subview %subview_73[%45, %46] [1, 4] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%c1_86 = arith.constant 1 : index
%c4_87 = arith.constant 4 : index
%c0_88 = arith.constant 0 : index
%cst_89 = arith.constant 0.000000e+00 : f32
%47 = vector.transfer_read %subview_84[%c0_88, %c0_88], %cst_89 : memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
%cst_90 = arith.constant 0.000000e+00 : f32
%48 = vector.transfer_read %subview_85[%c0_88, %c0_88], %cst_90 : memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x4xf32>
%c0_91 = arith.constant 0 : index
vector.transfer_write %47, %subview_85[%c0_91, %c0_91] : vector<1x4xf32>, memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%49 = vector.transfer_read %alloc_0[%22, %4, %c0], %cst_2 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%50 = vector.transfer_read %alloc_0[%22, %4, %c8], %cst_2 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%51 = vector.transfer_read %alloc_1[%22, %c0, %5], %cst_2 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%52 = vector.transfer_read %alloc_1[%22, %c8, %5], %cst_2 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%53 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %49, %51, %arg3 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %50, %52, %53 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
scf.yield %54 : vector<16x16xf32>
}
vector.transfer_write %6, %alloc[%4, %5] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x32xf32, #gpu.address_space<workgroup>>
gpu.barrier
%c16_5 = arith.constant 16 : index
%c32 = arith.constant 32 : index
%c0_6 = arith.constant 0 : index
%c32_7 = arith.constant 32 : index
%c16_8 = arith.constant 16 : index
%c0_9 = arith.constant 0 : index
%c32_10 = arith.constant 32 : index
%c32_11 = arith.constant 32 : index
%c32_12 = arith.constant 32 : index
%subview_13 = memref.subview %alloc[%c0_6, %c0_9] [16, 32] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_14 = memref.subview %subview[%c0_6, %c0_9] [16, 32] [1, 1] : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%7 = affine.apply affine_map<()[s0] -> (s0 mod 8)>()[%thread_id_x]
%c8_15 = arith.constant 8 : index
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%c16_16 = arith.constant 16 : index
%9 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 * 8 + s2 * 16 + s0 floordiv 8) floordiv 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%c0_17 = arith.constant 0 : index
%c16_18 = arith.constant 16 : index
%c1_19 = arith.constant 1 : index
%c0_20 = arith.constant 0 : index
%c32_21 = arith.constant 32 : index
%c4_22 = arith.constant 4 : index
%10 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%11 = affine.apply affine_map<() -> (16)>()
%12 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%13 = affine.apply affine_map<() -> (32)>()
%14 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%15 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%16 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%subview_23 = memref.subview %subview_13[%14, %15] [1, 4] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_24 = memref.subview %subview_14[%16, %17] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%c1_25 = arith.constant 1 : index
%c4_26 = arith.constant 4 : index
%c0_27 = arith.constant 0 : index
%cst_28 = arith.constant 0.000000e+00 : f32
%18 = vector.transfer_read %subview_23[%c0_27, %c0_27], %cst_28 : memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x4xf32>
%cst_29 = arith.constant 0.000000e+00 : f32
%19 = vector.transfer_read %subview_24[%c0_27, %c0_27], %cst_29 : memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
%c0_30 = arith.constant 0 : index
vector.transfer_write %18, %subview_24[%c0_30, %c0_30] : vector<1x4xf32>, memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%c1_31 = arith.constant 1 : index
%20 = arith.muli %c16_8, %c1_31 : index
%21 = arith.addi %c0_6, %20 : index
scf.for %arg2 = %c0_9 to %c32_10 step %c32_11 {
%subview_32 = memref.subview %alloc[%21, %arg2] [16, 32] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_33 = memref.subview %subview[%21, %arg2] [16, 32] [1, 1] : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%c1_34 = arith.constant 1 : index
%c4_35 = arith.constant 4 : index
%22 = affine.apply affine_map<()[s0] -> (s0 mod 8)>()[%thread_id_x]
%c8_36 = arith.constant 8 : index
%23 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%c16_37 = arith.constant 16 : index
%24 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 * 8 + s2 * 16 + s0 floordiv 8) floordiv 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%c0_38 = arith.constant 0 : index
%c16_39 = arith.constant 16 : index
%c1_40 = arith.constant 1 : index
%c0_41 = arith.constant 0 : index
%c32_42 = arith.constant 32 : index
%c4_43 = arith.constant 4 : index
%25 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%26 = affine.apply affine_map<() -> (16)>()
%27 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%28 = affine.apply affine_map<() -> (32)>()
%29 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%30 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%31 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%32 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%subview_44 = memref.subview %subview_32[%29, %30] [1, 4] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_45 = memref.subview %subview_33[%31, %32] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%c1_46 = arith.constant 1 : index
%c4_47 = arith.constant 4 : index
%c0_48 = arith.constant 0 : index
%cst_49 = arith.constant 0.000000e+00 : f32
%33 = vector.transfer_read %subview_44[%c0_48, %c0_48], %cst_49 : memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x4xf32>
%cst_50 = arith.constant 0.000000e+00 : f32
%34 = vector.transfer_read %subview_45[%c0_48, %c0_48], %cst_50 : memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
%c0_51 = arith.constant 0 : index
vector.transfer_write %33, %subview_45[%c0_51, %c0_51] : vector<1x4xf32>, memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
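// Relative to the dump above, canonicalization folds away the duplicated per-block constants and the trivial index arithmetic (muli/addi against constants), collapses the single-iteration scf.for loops in the epilogue into straight-line code, and marks the 1x4 vector transfers as in_bounds = [true, true], since the statically sized 1x4 subviews make the accesses provably in bounds.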
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%c8 = arith.constant 8 : index
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%thread_id_x_3 = gpu.thread_id x
%thread_id_y_4 = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y_4]
%4 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x_3]
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %cst_0) -> (vector<16x16xf32>) {
%16 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_13 = memref.subview %alloc_1[%16, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_14 = memref.subview %alloc_2[%16, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_15 = memref.subview %0[%arg0, %arg2] [32, 16] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_16 = memref.subview %1[%arg2, %arg1] [16, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
%17 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%18 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%19 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%subview_17 = memref.subview %subview_15[%17, %18] [1, 4] [1, 1] : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_18 = memref.subview %subview_13[%19, %20] [1, 4] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%21 = vector.transfer_read %subview_17[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
vector.transfer_write %21, %subview_18[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%23 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%24 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%25 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%subview_19 = memref.subview %subview_16[%22, %23] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_20 = memref.subview %subview_14[%24, %25] [1, 4] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%26 = vector.transfer_read %subview_19[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
vector.transfer_write %26, %subview_20[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%27 = vector.transfer_read %alloc_1[%16, %3, %c0], %cst {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%28 = vector.transfer_read %alloc_1[%16, %3, %c8], %cst {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%29 = vector.transfer_read %alloc_2[%16, %c0, %4], %cst {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%30 = vector.transfer_read %alloc_2[%16, %c8, %4], %cst {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%31 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %27, %29, %arg3 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%32 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %28, %30, %31 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
scf.yield %32 : vector<16x16xf32>
}
vector.transfer_write %5, %alloc[%3, %4] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x32xf32, #gpu.address_space<workgroup>>
gpu.barrier
%subview_5 = memref.subview %alloc[0, 0] [16, 32] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1]>, #gpu.address_space<workgroup>>
%subview_6 = memref.subview %subview[0, 0] [16, 32] [1, 1] : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%7 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%subview_7 = memref.subview %subview_5[%6, %7] [1, 4] [1, 1] : memref<16x32xf32, strided<[32, 1]>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_8 = memref.subview %subview_6[%8, %9] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%10 = vector.transfer_read %subview_7[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x4xf32>
vector.transfer_write %10, %subview_8[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_9 = memref.subview %alloc[16, 0] [16, 32] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: 512>, #gpu.address_space<workgroup>>
%subview_10 = memref.subview %subview[16, 0] [16, 32] [1, 1] : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%11 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%12 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%13 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%14 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%subview_11 = memref.subview %subview_9[%11, %12] [1, 4] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: 512>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_12 = memref.subview %subview_10[%13, %14] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%15 = vector.transfer_read %subview_11[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x4xf32>
vector.transfer_write %15, %subview_12[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
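// CSE deduplicates what canonicalization left behind: the repeated gpu.thread_id reads and the paired affine.apply index computations (e.g. %17/%19 and %22/%24 in the previous dump) are each reduced to a single SSA value shared by the source and destination subviews.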
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%c8 = arith.constant 8 : index
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %cst_0) -> (vector<16x16xf32>) {
%10 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_11 = memref.subview %alloc_1[%10, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_12 = memref.subview %alloc_2[%10, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_13 = memref.subview %0[%arg0, %arg2] [32, 16] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_14 = memref.subview %1[%arg2, %arg1] [16, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
%11 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%12 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%subview_15 = memref.subview %subview_13[%11, %12] [1, 4] [1, 1] : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_16 = memref.subview %subview_11[%11, %12] [1, 4] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%13 = vector.transfer_read %subview_15[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
vector.transfer_write %13, %subview_16[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%14 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%15 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%subview_17 = memref.subview %subview_14[%14, %15] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_18 = memref.subview %subview_12[%14, %15] [1, 4] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%16 = vector.transfer_read %subview_17[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
vector.transfer_write %16, %subview_18[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%17 = vector.transfer_read %alloc_1[%10, %3, %c0], %cst {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%18 = vector.transfer_read %alloc_1[%10, %3, %c8], %cst {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%19 = vector.transfer_read %alloc_2[%10, %c0, %4], %cst {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%20 = vector.transfer_read %alloc_2[%10, %c8, %4], %cst {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%21 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %19, %arg3 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%22 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %18, %20, %21 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
scf.yield %22 : vector<16x16xf32>
}
vector.transfer_write %5, %alloc[%3, %4] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x32xf32, #gpu.address_space<workgroup>>
gpu.barrier
%subview_3 = memref.subview %alloc[0, 0] [16, 32] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1]>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %subview[0, 0] [16, 32] [1, 1] : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%7 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%subview_5 = memref.subview %subview_3[%6, %7] [1, 4] [1, 1] : memref<16x32xf32, strided<[32, 1]>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_6 = memref.subview %subview_4[%6, %7] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = vector.transfer_read %subview_5[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x4xf32>
vector.transfer_write %8, %subview_6[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %alloc[16, 0] [16, 32] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: 512>, #gpu.address_space<workgroup>>
%subview_8 = memref.subview %subview[16, 0] [16, 32] [1, 1] : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_9 = memref.subview %subview_7[%6, %7] [1, 4] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: 512>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_10 = memref.subview %subview_8[%6, %7] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_9[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x4xf32>
vector.transfer_write %9, %subview_10[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After GPUReduceBankConflictsPass (iree-codegen-gpu-reduce-bank-conflicts) //----- //
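// To avoid shared-memory bank conflicts, the pass pads the inner dimension of every workgroup allocation (32x32 -> 32x36, 4x32x16 -> 4x32x20, 4x16x32 -> 4x16x36) and carves the original logical shapes back out with memref.subview, so later accesses use the padded leading dimensions (36, 20, 36) while the indexing logic stays the same.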
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%c8 = arith.constant 8 : index
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %alloc[0, 0] [32, 32] [1, 1] : memref<32x36xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[36, 1]>, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%subview_2 = memref.subview %alloc_1[0, 0, 0] [4, 32, 16] [1, 1, 1] : memref<4x32x20xf32, #gpu.address_space<workgroup>> to memref<4x32x16xf32, strided<[640, 20, 1]>, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %alloc_3[0, 0, 0] [4, 16, 32] [1, 1, 1] : memref<4x16x36xf32, #gpu.address_space<workgroup>> to memref<4x16x32xf32, strided<[576, 36, 1]>, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview_5 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %cst_0) -> (vector<16x16xf32>) {
%10 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_14 = memref.subview %subview_2[%10, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, strided<[640, 20, 1]>, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_15 = memref.subview %subview_4[%10, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, strided<[576, 36, 1]>, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[36, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_16 = memref.subview %0[%arg0, %arg2] [32, 16] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_17 = memref.subview %1[%arg2, %arg1] [16, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
%11 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%12 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%subview_18 = memref.subview %subview_16[%11, %12] [1, 4] [1, 1] : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_19 = memref.subview %subview_14[%11, %12] [1, 4] [1, 1] : memref<32x16xf32, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
%13 = vector.transfer_read %subview_18[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
vector.transfer_write %13, %subview_19[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
%14 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%15 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%subview_20 = memref.subview %subview_17[%14, %15] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_21 = memref.subview %subview_15[%14, %15] [1, 4] [1, 1] : memref<16x32xf32, strided<[36, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[36, 1], offset: ?>, #gpu.address_space<workgroup>>
%16 = vector.transfer_read %subview_20[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
vector.transfer_write %16, %subview_21[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[36, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%17 = vector.transfer_read %subview_2[%10, %3, %c0], %cst {in_bounds = [true, true]} : memref<4x32x16xf32, strided<[640, 20, 1]>, #gpu.address_space<workgroup>>, vector<16x8xf32>
%18 = vector.transfer_read %subview_2[%10, %3, %c8], %cst {in_bounds = [true, true]} : memref<4x32x16xf32, strided<[640, 20, 1]>, #gpu.address_space<workgroup>>, vector<16x8xf32>
%19 = vector.transfer_read %subview_4[%10, %c0, %4], %cst {in_bounds = [true, true]} : memref<4x16x32xf32, strided<[576, 36, 1]>, #gpu.address_space<workgroup>>, vector<8x16xf32>
%20 = vector.transfer_read %subview_4[%10, %c8, %4], %cst {in_bounds = [true, true]} : memref<4x16x32xf32, strided<[576, 36, 1]>, #gpu.address_space<workgroup>>, vector<8x16xf32>
%21 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %19, %arg3 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%22 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %18, %20, %21 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
scf.yield %22 : vector<16x16xf32>
}
vector.transfer_write %5, %subview[%3, %4] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x32xf32, strided<[36, 1]>, #gpu.address_space<workgroup>>
gpu.barrier
%subview_6 = memref.subview %subview[0, 0] [16, 32] [1, 1] : memref<32x32xf32, strided<[36, 1]>, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[36, 1]>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview_5[0, 0] [16, 32] [1, 1] : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%7 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%subview_8 = memref.subview %subview_6[%6, %7] [1, 4] [1, 1] : memref<16x32xf32, strided<[36, 1]>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[36, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_9 = memref.subview %subview_7[%6, %7] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = vector.transfer_read %subview_8[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[36, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x4xf32>
vector.transfer_write %8, %subview_9[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_10 = memref.subview %subview[16, 0] [16, 32] [1, 1] : memref<32x32xf32, strided<[36, 1]>, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[36, 1], offset: 576>, #gpu.address_space<workgroup>>
%subview_11 = memref.subview %subview_5[16, 0] [16, 32] [1, 1] : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_12 = memref.subview %subview_10[%6, %7] [1, 4] [1, 1] : memref<16x32xf32, strided<[36, 1], offset: 576>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[36, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_13 = memref.subview %subview_11[%6, %7] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_12[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[36, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x4xf32>
vector.transfer_write %9, %subview_13[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
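// Folding the memref alias ops removes the subview chains: reads and writes now index the bindings (%0, %1, %2) and the padded workgroup allocations directly, with the former subview offsets composed into the affine_map index expressions (note the extra %arg0 / %arg1 / %arg2 terms in the maps below).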
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%3 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %cst_0) -> (vector<16x16xf32>) {
gpu.barrier
%16 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%arg2, %thread_id_x]
%18 = vector.transfer_read %0[%16, %17], %cst {in_bounds = [true, true]} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%20 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%21 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
vector.transfer_write %18, %alloc_1[%19, %20, %21] {in_bounds = [true, true]} : vector<1x4xf32>, memref<4x32x20xf32, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg2, %thread_id_x, %thread_id_y, %thread_id_z]
%23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
%24 = vector.transfer_read %1[%22, %23], %cst {in_bounds = [true, true]} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
%25 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%26 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%27 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
vector.transfer_write %24, %alloc_2[%25, %26, %27] {in_bounds = [true, true]} : vector<1x4xf32>, memref<4x16x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%28 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%29 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%30 = vector.transfer_read %alloc_1[%28, %29, %c0], %cst {in_bounds = [true, true]} : memref<4x32x20xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%31 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%32 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%33 = vector.transfer_read %alloc_1[%31, %32, %c8], %cst {in_bounds = [true, true]} : memref<4x32x20xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%34 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%35 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%36 = vector.transfer_read %alloc_2[%34, %c0, %35], %cst {in_bounds = [true, true]} : memref<4x16x36xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%37 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%38 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%39 = vector.transfer_read %alloc_2[%37, %c8, %38], %cst {in_bounds = [true, true]} : memref<4x16x36xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%40 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %30, %36, %arg3 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%41 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %33, %39, %40 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
scf.yield %41 : vector<16x16xf32>
}
%4 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%5 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
vector.transfer_write %3, %alloc[%4, %5] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%6 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%7 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%8 = vector.transfer_read %alloc[%6, %7], %cst {in_bounds = [true, true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<1x4xf32>
%9 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %8, %2[%9, %10] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%11 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%12 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%13 = vector.transfer_read %alloc[%11, %12], %cst {in_bounds = [true, true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<1x4xf32>
%14 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %13, %2[%14, %15] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
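// This canonicalization round is effectively a no-op on this function; the remaining redundancy (repeated affine.apply of the same maps) is cleaned up by the CSE run that follows.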
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%3 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %cst_0) -> (vector<16x16xf32>) {
gpu.barrier
%16 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%arg2, %thread_id_x]
%18 = vector.transfer_read %0[%16, %17], %cst {in_bounds = [true, true]} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%20 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%21 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
vector.transfer_write %18, %alloc_1[%19, %20, %21] {in_bounds = [true, true]} : vector<1x4xf32>, memref<4x32x20xf32, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg2, %thread_id_x, %thread_id_y, %thread_id_z]
%23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
%24 = vector.transfer_read %1[%22, %23], %cst {in_bounds = [true, true]} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
%25 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%26 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%27 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
vector.transfer_write %24, %alloc_2[%25, %26, %27] {in_bounds = [true, true]} : vector<1x4xf32>, memref<4x16x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%28 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%29 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%30 = vector.transfer_read %alloc_1[%28, %29, %c0], %cst {in_bounds = [true, true]} : memref<4x32x20xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%31 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%32 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%33 = vector.transfer_read %alloc_1[%31, %32, %c8], %cst {in_bounds = [true, true]} : memref<4x32x20xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%34 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%35 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%36 = vector.transfer_read %alloc_2[%34, %c0, %35], %cst {in_bounds = [true, true]} : memref<4x16x36xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%37 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%38 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%39 = vector.transfer_read %alloc_2[%37, %c8, %38], %cst {in_bounds = [true, true]} : memref<4x16x36xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%40 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %30, %36, %arg3 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%41 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %33, %39, %40 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
scf.yield %41 : vector<16x16xf32>
}
%4 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%5 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
vector.transfer_write %3, %alloc[%4, %5] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%6 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%7 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%8 = vector.transfer_read %alloc[%6, %7], %cst {in_bounds = [true, true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<1x4xf32>
%9 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %8, %2[%9, %10] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%11 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%12 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%13 = vector.transfer_read %alloc[%11, %12], %cst {in_bounds = [true, true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<1x4xf32>
%14 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %13, %2[%14, %15] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
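// CSE merges the duplicate index computations introduced by alias folding: the repeated ((d0 floordiv 16) mod 4) buffer-slot index on %arg2 and the repeated thread-offset maps each become a single value (%17, %25, %28, ...) reused across the shared-memory writes and the subsequent reads.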
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%3 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %cst_0) -> (vector<16x16xf32>) {
gpu.barrier
%14 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%arg2, %thread_id_x]
%16 = vector.transfer_read %0[%14, %15], %cst {in_bounds = [true, true]} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
%17 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%18 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
vector.transfer_write %16, %alloc_1[%17, %18, %19] {in_bounds = [true, true]} : vector<1x4xf32>, memref<4x32x20xf32, #gpu.address_space<workgroup>>
%20 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg2, %thread_id_x, %thread_id_y, %thread_id_z]
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
%22 = vector.transfer_read %1[%20, %21], %cst {in_bounds = [true, true]} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
%23 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%24 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
vector.transfer_write %22, %alloc_2[%17, %23, %24] {in_bounds = [true, true]} : vector<1x4xf32>, memref<4x16x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%25 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%26 = vector.transfer_read %alloc_1[%17, %25, %c0], %cst {in_bounds = [true, true]} : memref<4x32x20xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%27 = vector.transfer_read %alloc_1[%17, %25, %c8], %cst {in_bounds = [true, true]} : memref<4x32x20xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%28 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%29 = vector.transfer_read %alloc_2[%17, %c0, %28], %cst {in_bounds = [true, true]} : memref<4x16x36xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%30 = vector.transfer_read %alloc_2[%17, %c8, %28], %cst {in_bounds = [true, true]} : memref<4x16x36xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%31 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %26, %29, %arg3 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%32 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %27, %30, %31 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
scf.yield %32 : vector<16x16xf32>
}
%4 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%5 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
vector.transfer_write %3, %alloc[%4, %5] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%6 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%7 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%8 = vector.transfer_read %alloc[%6, %7], %cst {in_bounds = [true, true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<1x4xf32>
%9 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %8, %2[%9, %10] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%11 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%12 = vector.transfer_read %alloc[%11, %7], %cst {in_bounds = [true, true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<1x4xf32>
%13 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
vector.transfer_write %12, %2[%13, %10] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After LLVMGPUVectorToGPUPass (iree-llvmgpu-vector-to-gpu) //----- //
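// The vector abstractions are mapped onto GPU primitives: global -> shared staging becomes nvgpu.device_async_copy, and the 16x8 / 8x16 / 16x16 tiles become gpu.subgroup_mma_load_matrix / subgroup_mma_compute / subgroup_mma_store_matrix. The original vector.transfer_read / vector.contract ops are still emitted side by side with their MMA replacements here and are presumably removed by later cleanup passes.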
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst_0 : !gpu.mma_matrix<16x16xf32, "COp">
%cst_1 = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%4:2 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %cst_1, %arg4 = %0) -> (vector<16x16xf32>, !gpu.mma_matrix<16x16xf32, "COp">) {
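    // The K-loop now carries two accumulators: the original vector<16x16xf32> (%arg3) and the new !gpu.mma_matrix<16x16xf32, "COp"> (%arg4). Only the MMA accumulator feeds the store after the loop, so the vector path is dead and should be eliminated later.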
gpu.barrier
%15 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%arg2, %thread_id_x]
%17 = vector.transfer_read %1[%15, %16], %cst {in_bounds = [true]} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>, vector<4xf32>
%18 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%19 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%21 = nvgpu.device_async_copy %1[%15, %16], %alloc_2[%18, %19, %20], 4 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg2, %thread_id_x, %thread_id_y, %thread_id_z]
%23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
%24 = vector.transfer_read %2[%22, %23], %cst {in_bounds = [true]} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>, vector<4xf32>
%25 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%26 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%27 = nvgpu.device_async_copy %2[%22, %23], %alloc_3[%18, %25, %26], 4 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%28 = nvgpu.device_async_create_group %21, %27
nvgpu.device_async_wait %28
gpu.barrier
%29 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%30 = gpu.subgroup_mma_load_matrix %alloc_2[%18, %29, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%31 = vector.transfer_read %alloc_2[%18, %29, %c0], %cst {in_bounds = [true, true]} : memref<4x32x20xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%32 = gpu.subgroup_mma_load_matrix %alloc_2[%18, %29, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%33 = vector.transfer_read %alloc_2[%18, %29, %c8], %cst {in_bounds = [true, true]} : memref<4x32x20xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%34 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%35 = gpu.subgroup_mma_load_matrix %alloc_3[%18, %c0, %34] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%36 = vector.transfer_read %alloc_3[%18, %c0, %34], %cst {in_bounds = [true, true]} : memref<4x16x36xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%37 = gpu.subgroup_mma_load_matrix %alloc_3[%18, %c8, %34] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%38 = vector.transfer_read %alloc_3[%18, %c8, %34], %cst {in_bounds = [true, true]} : memref<4x16x36xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%39 = gpu.subgroup_mma_compute %30, %35, %arg4 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%40 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %31, %36, %arg3 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%41 = gpu.subgroup_mma_compute %32, %37, %39 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%42 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %33, %38, %40 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
scf.yield %cst_1, %41 : vector<16x16xf32>, !gpu.mma_matrix<16x16xf32, "COp">
}
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%6 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.subgroup_mma_store_matrix %4#1, %alloc[%5, %6] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%7 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%8 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%9 = vector.transfer_read %alloc[%7, %8], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%10 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%11 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %9, %3[%10, %11] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%12 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%13 = vector.transfer_read %alloc[%12, %8], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%14 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
vector.transfer_write %13, %3[%14, %11] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
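
After this pass the staged copies are expressed as nvgpu.device_async_copy (cp.async) into padded workgroup buffers and the inner product as gpu.subgroup_mma_* ops; the old vector.transfer_read/vector.contract chain still rides along as a dead second accumulator and is removed by the canonicalizer below. The following is a minimal Python sketch, not part of the dump, that just transcribes the affine maps feeding the A-operand copy; the helper name and plain-integer thread ids are invented for illustration.

# Sketch only: the affine maps behind the nvgpu.device_async_copy of the A operand
# into %alloc_2 : memref<4x32x20xf32, workgroup>. Helper name is hypothetical.
def a_tile_copy_indices(wg_row, k, tid_x, tid_y, tid_z):
    # global source element; each thread copies 4 contiguous f32 per cp.async
    src_row = wg_row + tid_y * 16 + tid_z * 32 + tid_x // 4
    src_col = k + (tid_x % 4) * 4            # == k + tid_x*4 - (tid_x floordiv 4)*16
    # destination slot in the 4-deep staging buffer; rows are padded 16 -> 20
    stage   = (k // 16) % 4
    dst_row = tid_y * 16 + tid_z * 32 + tid_x // 4
    dst_col = (tid_x % 4) * 4
    return (src_row, src_col), (stage, dst_row, dst_col)

# e.g. thread (x=17, y=1, z=0) at k = 48 in the workgroup tile starting at row 96:
assert a_tile_copy_indices(96, 48, 17, 1, 0) == ((116, 52), (3, 20, 4))
# 128 threads x 4 floats covers the 32x16 A tile each k-step; the B copy into
# %alloc_3 (4x16x36) uses the analogous maps with //8 and %8 in place of //4 and %4.
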
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%4 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %0) -> (!gpu.mma_matrix<16x16xf32, "COp">) {
gpu.barrier
%15 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%arg2, %thread_id_x]
%17 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%18 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%20 = nvgpu.device_async_copy %1[%15, %16], %alloc_0[%17, %18, %19], 4 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%21 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg2, %thread_id_x, %thread_id_y, %thread_id_z]
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
%23 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%24 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%25 = nvgpu.device_async_copy %2[%21, %22], %alloc_1[%17, %23, %24], 4 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%26 = nvgpu.device_async_create_group %20, %25
nvgpu.device_async_wait %26
gpu.barrier
%27 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%28 = gpu.subgroup_mma_load_matrix %alloc_0[%17, %27, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%29 = gpu.subgroup_mma_load_matrix %alloc_0[%17, %27, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%30 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%31 = gpu.subgroup_mma_load_matrix %alloc_1[%17, %c0, %30] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%32 = gpu.subgroup_mma_load_matrix %alloc_1[%17, %c8, %30] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%33 = gpu.subgroup_mma_compute %28, %31, %arg3 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%34 = gpu.subgroup_mma_compute %29, %32, %33 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
scf.yield %34 : !gpu.mma_matrix<16x16xf32, "COp">
}
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%6 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.subgroup_mma_store_matrix %4, %alloc[%5, %6] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%7 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%8 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%9 = vector.transfer_read %alloc[%7, %8], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%10 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%11 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %9, %3[%10, %11] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%12 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%13 = vector.transfer_read %alloc[%12, %8], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%14 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
vector.transfer_write %13, %3[%14, %11] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
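
Canonicalization has dropped the dead vector accumulator, so only the !gpu.mma_matrix value is carried through the k-loop, and the duplicate zero constant is folded away. The epilogue after the loop stages each warp's 16x16 result through the 32x36 shared tile and writes it out with two 4-wide vector stores per thread; the sketch below (hypothetical helper, plain-integer thread ids) transcribes that write-back mapping from the affine maps above.

# Sketch only: where each thread's two vector<4xf32> stores land in C (%3).
def c_writeback_indices(wg_row, wg_col, tid_x, tid_y, tid_z):
    smem_row = tid_y * 8 + tid_z * 16 + tid_x // 8   # row read from the 32x36 tile
    smem_col = (tid_x % 8) * 4                       # 4-wide f32 slice
    out_row0 = wg_row + smem_row                     # first transfer_write
    out_row1 = out_row0 + 16                         # second one, 16 rows below
    out_col  = wg_col + smem_col
    return (smem_row, smem_col), (out_row0, out_col), (out_row1, out_col)

# 128 threads x 2 stores x 4 floats = 1024 elements, i.e. the whole 32x32 C tile.
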
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%4 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %0) -> (!gpu.mma_matrix<16x16xf32, "COp">) {
gpu.barrier
%15 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%arg2, %thread_id_x]
%17 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%18 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%20 = nvgpu.device_async_copy %1[%15, %16], %alloc_0[%17, %18, %19], 4 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%21 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg2, %thread_id_x, %thread_id_y, %thread_id_z]
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
%23 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%24 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%25 = nvgpu.device_async_copy %2[%21, %22], %alloc_1[%17, %23, %24], 4 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%26 = nvgpu.device_async_create_group %20, %25
nvgpu.device_async_wait %26
gpu.barrier
%27 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%28 = gpu.subgroup_mma_load_matrix %alloc_0[%17, %27, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%29 = gpu.subgroup_mma_load_matrix %alloc_0[%17, %27, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%30 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%31 = gpu.subgroup_mma_load_matrix %alloc_1[%17, %c0, %30] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%32 = gpu.subgroup_mma_load_matrix %alloc_1[%17, %c8, %30] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%33 = gpu.subgroup_mma_compute %28, %31, %arg3 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%34 = gpu.subgroup_mma_compute %29, %32, %33 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
scf.yield %34 : !gpu.mma_matrix<16x16xf32, "COp">
}
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%6 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.subgroup_mma_store_matrix %4, %alloc[%5, %6] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%7 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%8 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%9 = vector.transfer_read %alloc[%7, %8], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%10 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%11 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %9, %3[%10, %11] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%12 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%13 = vector.transfer_read %alloc[%12, %8], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%14 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
vector.transfer_write %13, %3[%14, %11] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
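
CSE leaves this dump essentially unchanged (the canonicalizer already de-duplicated the constants and maps). The warp-level decomposition is now easy to read off: with workgroup_size = [64, 2, 1] there are four warps of 32 threads, each owning one 16x16 sub-tile of the 32x32 workgroup tile, and each k-step issues two 16x8 * 8x16 subgroup_mma_compute ops per warp. A tiny illustrative check of that warp-to-tile mapping (names invented here):

# Sketch only: warp sub-tile origin from the maps
#   row = tid_y * 16,  col = (tid_x // 32) * 16   (workgroup_size = [64, 2, 1]).
def warp_tile(tid_x, tid_y):
    return tid_y * 16, (tid_x // 32) * 16

tiles = {warp_tile(x, y) for y in range(2) for x in range(64)}
assert tiles == {(0, 0), (0, 16), (16, 0), (16, 16)}   # a 2x2 grid of 16x16 warp tiles
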
// -----// IR Dump After IREELoopInvariantCodeMotionPass (iree-loop-invariant-code-motion) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%12 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %0) -> (!gpu.mma_matrix<16x16xf32, "COp">) {
gpu.barrier
%23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%arg2, %thread_id_x]
%24 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%25 = nvgpu.device_async_copy %1[%4, %23], %alloc_0[%24, %5, %6], 4 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%26 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg2, %thread_id_x, %thread_id_y, %thread_id_z]
%27 = nvgpu.device_async_copy %2[%26, %7], %alloc_1[%24, %8, %9], 4 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%28 = nvgpu.device_async_create_group %25, %27
nvgpu.device_async_wait %28
gpu.barrier
%29 = gpu.subgroup_mma_load_matrix %alloc_0[%24, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%30 = gpu.subgroup_mma_load_matrix %alloc_0[%24, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%31 = gpu.subgroup_mma_load_matrix %alloc_1[%24, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%32 = gpu.subgroup_mma_load_matrix %alloc_1[%24, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%33 = gpu.subgroup_mma_compute %29, %31, %arg3 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%34 = gpu.subgroup_mma_compute %30, %32, %33 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
scf.yield %34 : !gpu.mma_matrix<16x16xf32, "COp">
}
%13 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%14 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.subgroup_mma_store_matrix %12, %alloc[%13, %14] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%15 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%16 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%17 = vector.transfer_read %alloc[%15, %16], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%18 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %17, %3[%18, %19] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%20 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%21 = vector.transfer_read %alloc[%20, %16], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%22 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
vector.transfer_write %21, %3[%22, %19] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
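
Loop-invariant code motion hoists every index computation that depends only on thread ids and the workgroup offsets out of the k-loop; the only per-iteration address math left inside it is the A source column, the B source row, and the staging-buffer slot (k floordiv 16) mod 4. A short sketch (illustrative, not from the dump) of how that slot index cycles the 4-deep buffers across the 64 k-steps:

# Sketch only: the ring-buffer stage selected by affine_map<(d0) -> ((d0 floordiv 16) mod 4)>.
K, STEP, DEPTH = 1024, 16, 4
stages = [(k // STEP) % DEPTH for k in range(0, K, STEP)]
print(stages[:8])   # [0, 1, 2, 3, 0, 1, 2, 3] -- each stage is reused every 4 iterations
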
// -----// IR Dump After GPUPipeliningPass (iree-codegen-gpu-pipelining) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%c0_2 = arith.constant 0 : index
%12 = arith.muli %c16, %c0_2 : index
%13 = arith.addi %c0, %12 : index
gpu.barrier {__pipelining_first_stage__}
%14 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%13, %thread_id_x]
%15 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%13)
%16 = nvgpu.device_async_copy %1[%4, %14], %alloc_0[%15, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%17 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%13, %thread_id_x, %thread_id_y, %thread_id_z]
%18 = nvgpu.device_async_copy %2[%17, %7], %alloc_1[%15, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%19 = nvgpu.device_async_create_group %16, %18 {__pipelining_first_stage__}
%c1 = arith.constant 1 : index
%20 = arith.muli %c16, %c1 : index
%21 = arith.addi %c0, %20 : index
gpu.barrier {__pipelining_first_stage__}
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%21, %thread_id_x]
%23 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%21)
%24 = nvgpu.device_async_copy %1[%4, %22], %alloc_0[%23, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%25 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%21, %thread_id_x, %thread_id_y, %thread_id_z]
%26 = nvgpu.device_async_copy %2[%25, %7], %alloc_1[%23, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%27 = nvgpu.device_async_create_group %24, %26 {__pipelining_first_stage__}
%c2 = arith.constant 2 : index
%28 = arith.muli %c16, %c2 : index
%29 = arith.addi %c0, %28 : index
gpu.barrier {__pipelining_first_stage__}
%30 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%29, %thread_id_x]
%31 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%29)
%32 = nvgpu.device_async_copy %1[%4, %30], %alloc_0[%31, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%33 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%29, %thread_id_x, %thread_id_y, %thread_id_z]
%34 = nvgpu.device_async_copy %2[%33, %7], %alloc_1[%31, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%35 = nvgpu.device_async_create_group %32, %34 {__pipelining_first_stage__}
%c3 = arith.constant 3 : index
%36 = arith.muli %c16, %c3 : index
%37 = arith.addi %c0, %36 : index
gpu.barrier {__pipelining_first_stage__}
%38 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%37, %thread_id_x]
%39 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%37)
%40 = nvgpu.device_async_copy %1[%4, %38], %alloc_0[%39, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%37, %thread_id_x, %thread_id_y, %thread_id_z]
%42 = nvgpu.device_async_copy %2[%41, %7], %alloc_1[%39, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
%44:9 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %0, %arg4 = %19, %arg5 = %27, %arg6 = %35, %arg7 = %43, %arg8 = %15, %arg9 = %23, %arg10 = %31, %arg11 = %39) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%c4 = arith.constant 4 : index
%55 = arith.muli %c16, %c4 : index
%56 = arith.subi %c1024, %55 : index
%57 = arith.cmpi slt, %arg2, %56 : index
%c3_3 = arith.constant 3 : index
%58 = arith.muli %c16, %c3_3 : index
%59 = arith.subi %c1024, %58 : index
%60 = arith.cmpi slt, %arg2, %59 : index
%c2_4 = arith.constant 2 : index
%61 = arith.muli %c16, %c2_4 : index
%62 = arith.subi %c1024, %61 : index
%63 = arith.cmpi slt, %arg2, %62 : index
%c1_5 = arith.constant 1 : index
%64 = arith.muli %c16, %c1_5 : index
%65 = arith.subi %c1024, %64 : index
%66 = arith.cmpi slt, %arg2, %65 : index
nvgpu.device_async_wait %arg4 {numGroups = 3 : i32}
gpu.barrier
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%arg8, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_0[%arg8, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%arg8, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_load_matrix %alloc_1[%arg8, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%71 = gpu.subgroup_mma_compute %67, %69, %arg3 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%72 = gpu.subgroup_mma_compute %68, %70, %71 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%c4_6 = arith.constant 4 : index
%73 = arith.muli %c16, %c4_6 : index
%74 = arith.addi %arg2, %73 : index
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%74, %thread_id_x]
%c4_7 = arith.constant 4 : index
%76 = arith.muli %c16, %c4_7 : index
%77 = arith.addi %arg2, %76 : index
%78 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%77)
%c4_8 = arith.constant 4 : index
%c0_9 = arith.constant 0 : index
%79 = arith.select %57, %c4_8, %c0_9 : index
%80 = nvgpu.device_async_copy %1[%4, %75], %alloc_0[%78, %5, %6], 4, %79 {bypassL1} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%c4_10 = arith.constant 4 : index
%81 = arith.muli %c16, %c4_10 : index
%82 = arith.addi %arg2, %81 : index
%83 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%82, %thread_id_x, %thread_id_y, %thread_id_z]
%c4_11 = arith.constant 4 : index
%c0_12 = arith.constant 0 : index
%84 = arith.select %57, %c4_11, %c0_12 : index
%85 = nvgpu.device_async_copy %2[%83, %7], %alloc_1[%78, %8, %9], 4, %84 {bypassL1} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %80, %85 {__pipelining_first_stage__}
scf.yield %72, %arg5, %arg6, %arg7, %86, %arg9, %arg10, %arg11, %78 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
%45 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%46 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.subgroup_mma_store_matrix %44#0, %alloc[%45, %46] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%47 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%48 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%49 = vector.transfer_read %alloc[%47, %48], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%50 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%51 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %49, %3[%50, %51] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%52 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%53 = vector.transfer_read %alloc[%52, %48], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%54 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
vector.transfer_write %53, %3[%54, %51] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
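
The pipeliner (pipeline_depth = 4, store_stage = 1) peels a four-stage prologue that issues the cp.async groups for k = 0, 16, 32, 48, then rewrites the loop so that each iteration waits until at most three copy groups remain in flight (nvgpu.device_async_wait ... {numGroups = 3}, which guarantees the group for the current stage has landed), computes on that stage, and issues the copy for k + 64 into the stage it will occupy, with the element count selected down to 0 once k + 64 runs past 1024 so the tail iterations copy nothing. Below is a compact Python model of that schedule; it is an illustration under those observations, not the pass's own code.

# Sketch only: a queue model of the depth-4 cp.async pipeline produced above.
K, STEP, DEPTH = 1024, 16, 4
in_flight = [k for k in range(0, DEPTH * STEP, STEP)]   # prologue: groups for k = 0, 16, 32, 48

for k in range(0, K, STEP):
    # device_async_wait {numGroups = 3}: block until at most 3 groups remain
    # outstanding, i.e. the group fetched for this k has completed.
    while len(in_flight) > DEPTH - 1:
        assert in_flight.pop(0) == k
    stage = (k // STEP) % DEPTH          # compute: two subgroup_mma ops on this stage
    k_next = k + DEPTH * STEP
    n_elems = 4 if k_next < K else 0     # arith.select: tail copies move 0 elements
    in_flight.append(k_next)             # issue the group for k + 64 (a no-op when n_elems == 0)
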
// -----// IR Dump After LLVMGPUPackSharedMemoryAllocPass (iree-llvmgpu-pack-shared-memory-alloc) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%c0_2 = arith.constant 0 : index
%12 = arith.muli %c16, %c0_2 : index
%13 = arith.addi %c0, %12 : index
gpu.barrier {__pipelining_first_stage__}
%14 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%13, %thread_id_x]
%15 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%13)
%16 = nvgpu.device_async_copy %1[%4, %14], %alloc_0[%15, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%17 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%13, %thread_id_x, %thread_id_y, %thread_id_z]
%18 = nvgpu.device_async_copy %2[%17, %7], %alloc_1[%15, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%19 = nvgpu.device_async_create_group %16, %18 {__pipelining_first_stage__}
%c1 = arith.constant 1 : index
%20 = arith.muli %c16, %c1 : index
%21 = arith.addi %c0, %20 : index
gpu.barrier {__pipelining_first_stage__}
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%21, %thread_id_x]
%23 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%21)
%24 = nvgpu.device_async_copy %1[%4, %22], %alloc_0[%23, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%25 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%21, %thread_id_x, %thread_id_y, %thread_id_z]
%26 = nvgpu.device_async_copy %2[%25, %7], %alloc_1[%23, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%27 = nvgpu.device_async_create_group %24, %26 {__pipelining_first_stage__}
%c2 = arith.constant 2 : index
%28 = arith.muli %c16, %c2 : index
%29 = arith.addi %c0, %28 : index
gpu.barrier {__pipelining_first_stage__}
%30 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%29, %thread_id_x]
%31 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%29)
%32 = nvgpu.device_async_copy %1[%4, %30], %alloc_0[%31, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%33 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%29, %thread_id_x, %thread_id_y, %thread_id_z]
%34 = nvgpu.device_async_copy %2[%33, %7], %alloc_1[%31, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%35 = nvgpu.device_async_create_group %32, %34 {__pipelining_first_stage__}
%c3 = arith.constant 3 : index
%36 = arith.muli %c16, %c3 : index
%37 = arith.addi %c0, %36 : index
gpu.barrier {__pipelining_first_stage__}
%38 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%37, %thread_id_x]
%39 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%37)
%40 = nvgpu.device_async_copy %1[%4, %38], %alloc_0[%39, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%37, %thread_id_x, %thread_id_y, %thread_id_z]
%42 = nvgpu.device_async_copy %2[%41, %7], %alloc_1[%39, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
%44:9 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %0, %arg4 = %19, %arg5 = %27, %arg6 = %35, %arg7 = %43, %arg8 = %15, %arg9 = %23, %arg10 = %31, %arg11 = %39) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%c4 = arith.constant 4 : index
%55 = arith.muli %c16, %c4 : index
%56 = arith.subi %c1024, %55 : index
%57 = arith.cmpi slt, %arg2, %56 : index
%c3_3 = arith.constant 3 : index
%58 = arith.muli %c16, %c3_3 : index
%59 = arith.subi %c1024, %58 : index
%60 = arith.cmpi slt, %arg2, %59 : index
%c2_4 = arith.constant 2 : index
%61 = arith.muli %c16, %c2_4 : index
%62 = arith.subi %c1024, %61 : index
%63 = arith.cmpi slt, %arg2, %62 : index
%c1_5 = arith.constant 1 : index
%64 = arith.muli %c16, %c1_5 : index
%65 = arith.subi %c1024, %64 : index
%66 = arith.cmpi slt, %arg2, %65 : index
nvgpu.device_async_wait %arg4 {numGroups = 3 : i32}
gpu.barrier
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%arg8, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_0[%arg8, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%arg8, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_load_matrix %alloc_1[%arg8, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%71 = gpu.subgroup_mma_compute %67, %69, %arg3 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%72 = gpu.subgroup_mma_compute %68, %70, %71 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%c4_6 = arith.constant 4 : index
%73 = arith.muli %c16, %c4_6 : index
%74 = arith.addi %arg2, %73 : index
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%74, %thread_id_x]
%c4_7 = arith.constant 4 : index
%76 = arith.muli %c16, %c4_7 : index
%77 = arith.addi %arg2, %76 : index
%78 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%77)
%c4_8 = arith.constant 4 : index
%c0_9 = arith.constant 0 : index
%79 = arith.select %57, %c4_8, %c0_9 : index
%80 = nvgpu.device_async_copy %1[%4, %75], %alloc_0[%78, %5, %6], 4, %79 {bypassL1} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%c4_10 = arith.constant 4 : index
%81 = arith.muli %c16, %c4_10 : index
%82 = arith.addi %arg2, %81 : index
%83 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%82, %thread_id_x, %thread_id_y, %thread_id_z]
%c4_11 = arith.constant 4 : index
%c0_12 = arith.constant 0 : index
%84 = arith.select %57, %c4_11, %c0_12 : index
%85 = nvgpu.device_async_copy %2[%83, %7], %alloc_1[%78, %8, %9], 4, %84 {bypassL1} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %80, %85 {__pipelining_first_stage__}
scf.yield %72, %arg5, %arg6, %arg7, %86, %arg9, %arg10, %arg11, %78 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
%45 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%46 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.subgroup_mma_store_matrix %44#0, %alloc[%45, %46] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%47 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%48 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%49 = vector.transfer_read %alloc[%47, %48], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%50 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%51 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %49, %3[%50, %51] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%52 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%53 = vector.transfer_read %alloc[%52, %48], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%54 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
vector.transfer_write %53, %3[%54, %51] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
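
In this dump the pack-shared-memory pass only reorders the three workgroup allocations below the binding subspans; they are not merged into a single buffer here. Their combined footprint, i.e. the per-workgroup shared memory this kernel needs, follows from the memref shapes (the 20- and 36-element rows are the padded leading dimensions, a padding typically chosen to reduce shared-memory bank conflicts). Plain arithmetic, shown as a sketch:

# Sketch only: shared-memory footprint of the three workgroup allocations above.
f32 = 4  # bytes
allocs = {
    "%alloc   (C staging, 32x36)":  32 * 36 * f32,
    "%alloc_0 (A stages, 4x32x20)": 4 * 32 * 20 * f32,
    "%alloc_1 (B stages, 4x16x36)": 4 * 16 * 36 * f32,
}
print(allocs, sum(allocs.values()))   # 4608 + 10240 + 9216 = 24064 bytes (~23.5 KiB per workgroup)
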
// -----// IR Dump After LLVMGPULowerExecutableTargetPass (iree-llvmgpu-lower-executable-target) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%c0_2 = arith.constant 0 : index
%12 = arith.muli %c16, %c0_2 : index
%13 = arith.addi %c0, %12 : index
gpu.barrier {__pipelining_first_stage__}
%14 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%13, %thread_id_x]
%15 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%13)
%16 = nvgpu.device_async_copy %1[%4, %14], %alloc_0[%15, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%17 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%13, %thread_id_x, %thread_id_y, %thread_id_z]
%18 = nvgpu.device_async_copy %2[%17, %7], %alloc_1[%15, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%19 = nvgpu.device_async_create_group %16, %18 {__pipelining_first_stage__}
%c1 = arith.constant 1 : index
%20 = arith.muli %c16, %c1 : index
%21 = arith.addi %c0, %20 : index
gpu.barrier {__pipelining_first_stage__}
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%21, %thread_id_x]
%23 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%21)
%24 = nvgpu.device_async_copy %1[%4, %22], %alloc_0[%23, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%25 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%21, %thread_id_x, %thread_id_y, %thread_id_z]
%26 = nvgpu.device_async_copy %2[%25, %7], %alloc_1[%23, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%27 = nvgpu.device_async_create_group %24, %26 {__pipelining_first_stage__}
%c2 = arith.constant 2 : index
%28 = arith.muli %c16, %c2 : index
%29 = arith.addi %c0, %28 : index
gpu.barrier {__pipelining_first_stage__}
%30 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%29, %thread_id_x]
%31 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%29)
%32 = nvgpu.device_async_copy %1[%4, %30], %alloc_0[%31, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%33 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%29, %thread_id_x, %thread_id_y, %thread_id_z]
%34 = nvgpu.device_async_copy %2[%33, %7], %alloc_1[%31, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%35 = nvgpu.device_async_create_group %32, %34 {__pipelining_first_stage__}
%c3 = arith.constant 3 : index
%36 = arith.muli %c16, %c3 : index
%37 = arith.addi %c0, %36 : index
gpu.barrier {__pipelining_first_stage__}
%38 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%37, %thread_id_x]
%39 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%37)
%40 = nvgpu.device_async_copy %1[%4, %38], %alloc_0[%39, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%37, %thread_id_x, %thread_id_y, %thread_id_z]
%42 = nvgpu.device_async_copy %2[%41, %7], %alloc_1[%39, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
%44:9 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %0, %arg4 = %19, %arg5 = %27, %arg6 = %35, %arg7 = %43, %arg8 = %15, %arg9 = %23, %arg10 = %31, %arg11 = %39) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%c4 = arith.constant 4 : index
%55 = arith.muli %c16, %c4 : index
%56 = arith.subi %c1024, %55 : index
%57 = arith.cmpi slt, %arg2, %56 : index
%c3_3 = arith.constant 3 : index
%58 = arith.muli %c16, %c3_3 : index
%59 = arith.subi %c1024, %58 : index
%60 = arith.cmpi slt, %arg2, %59 : index
%c2_4 = arith.constant 2 : index
%61 = arith.muli %c16, %c2_4 : index
%62 = arith.subi %c1024, %61 : index
%63 = arith.cmpi slt, %arg2, %62 : index
%c1_5 = arith.constant 1 : index
%64 = arith.muli %c16, %c1_5 : index
%65 = arith.subi %c1024, %64 : index
%66 = arith.cmpi slt, %arg2, %65 : index
nvgpu.device_async_wait %arg4 {numGroups = 3 : i32}
gpu.barrier
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%arg8, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_0[%arg8, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%arg8, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_load_matrix %alloc_1[%arg8, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%71 = gpu.subgroup_mma_compute %67, %69, %arg3 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%72 = gpu.subgroup_mma_compute %68, %70, %71 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%c4_6 = arith.constant 4 : index
%73 = arith.muli %c16, %c4_6 : index
%74 = arith.addi %arg2, %73 : index
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%74, %thread_id_x]
%c4_7 = arith.constant 4 : index
%76 = arith.muli %c16, %c4_7 : index
%77 = arith.addi %arg2, %76 : index
%78 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%77)
%c4_8 = arith.constant 4 : index
%c0_9 = arith.constant 0 : index
%79 = arith.select %57, %c4_8, %c0_9 : index
%80 = nvgpu.device_async_copy %1[%4, %75], %alloc_0[%78, %5, %6], 4, %79 {bypassL1} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%c4_10 = arith.constant 4 : index
%81 = arith.muli %c16, %c4_10 : index
%82 = arith.addi %arg2, %81 : index
%83 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%82, %thread_id_x, %thread_id_y, %thread_id_z]
%c4_11 = arith.constant 4 : index
%c0_12 = arith.constant 0 : index
%84 = arith.select %57, %c4_11, %c0_12 : index
%85 = nvgpu.device_async_copy %2[%83, %7], %alloc_1[%78, %8, %9], 4, %84 {bypassL1} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %80, %85 {__pipelining_first_stage__}
scf.yield %72, %arg5, %arg6, %arg7, %86, %arg9, %arg10, %arg11, %78 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
%45 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%46 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.subgroup_mma_store_matrix %44#0, %alloc[%45, %46] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%47 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%48 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%49 = vector.transfer_read %alloc[%47, %48], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%50 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%51 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %49, %3[%50, %51] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%52 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%53 = vector.transfer_read %alloc[%52, %48], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%54 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
vector.transfer_write %53, %3[%54, %51] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
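// The loop above accumulates the per-warp 16x16 C tile through gpu.subgroup_mma_load_matrix,
// gpu.subgroup_mma_compute and (after the loop) gpu.subgroup_mma_store_matrix, i.e. the
// warp-level WMMA tensor-core path. For reference, a minimal stand-alone CUDA analogue of that
// load/compute/store sequence is sketched below; it uses the classic f16 m16n16k16 WMMA shape
// rather than the f32 16x8 / 8x16 operands in this dump, and the kernel name and pointer
// arguments are illustrative only, not the generated code.

#include <cuda_fp16.h>
#include <mma.h>
using namespace nvcuda;

// One warp multiplies a 16x16 A tile by a 16x16 B tile and accumulates into a
// 16x16 f32 C tile, mirroring the subgroup_mma load/compute/store sequence above.
__global__ void wmma_tile_example(const half *A, const half *B, float *C,
                                  int lda, int ldb, int ldc) {
  wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag;  // ~ "AOp"
  wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;  // ~ "BOp"
  wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag;               // ~ "COp"

  wmma::fill_fragment(c_frag, 0.0f);                // gpu.subgroup_mma_constant_matrix
  wmma::load_matrix_sync(a_frag, A, lda);           // gpu.subgroup_mma_load_matrix (leadDimension = lda)
  wmma::load_matrix_sync(b_frag, B, ldb);
  wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);   // gpu.subgroup_mma_compute
  wmma::store_matrix_sync(C, c_frag, ldc, wmma::mem_row_major);  // gpu.subgroup_mma_store_matrix
}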
// -----// IR Dump After ReconcileTranslationInfoPass (iree-codegen-reconcile-translation-info) //----- //
hal.executable.variant public @cuda target(<"cuda", "cuda-nvptx-fb">) {
hal.executable.export public @dot_dispatch_0 layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) attributes {subgroup_size = 32 : index, workgroup_size = [64 : index, 2 : index, 1 : index]} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @dot_dispatch_0() {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%4, %thread_id_x, %thread_id_y, %thread_id_z]
%7 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%8 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%5, %thread_id_x]
%10 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%11 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%12 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%13 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%c0_2 = arith.constant 0 : index
%14 = arith.muli %c16, %c0_2 : index
%15 = arith.addi %c0, %14 : index
gpu.barrier {__pipelining_first_stage__}
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%15, %thread_id_x]
%17 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%15)
%18 = nvgpu.device_async_copy %1[%6, %16], %alloc_0[%17, %7, %8], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%19 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%15, %thread_id_x, %thread_id_y, %thread_id_z]
%20 = nvgpu.device_async_copy %2[%19, %9], %alloc_1[%17, %10, %11], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%21 = nvgpu.device_async_create_group %18, %20 {__pipelining_first_stage__}
%c1 = arith.constant 1 : index
%22 = arith.muli %c16, %c1 : index
%23 = arith.addi %c0, %22 : index
gpu.barrier {__pipelining_first_stage__}
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%23, %thread_id_x]
%25 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%23)
%26 = nvgpu.device_async_copy %1[%6, %24], %alloc_0[%25, %7, %8], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%27 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%23, %thread_id_x, %thread_id_y, %thread_id_z]
%28 = nvgpu.device_async_copy %2[%27, %9], %alloc_1[%25, %10, %11], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%29 = nvgpu.device_async_create_group %26, %28 {__pipelining_first_stage__}
%c2 = arith.constant 2 : index
%30 = arith.muli %c16, %c2 : index
%31 = arith.addi %c0, %30 : index
gpu.barrier {__pipelining_first_stage__}
%32 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%31, %thread_id_x]
%33 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%31)
%34 = nvgpu.device_async_copy %1[%6, %32], %alloc_0[%33, %7, %8], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%35 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%31, %thread_id_x, %thread_id_y, %thread_id_z]
%36 = nvgpu.device_async_copy %2[%35, %9], %alloc_1[%33, %10, %11], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_create_group %34, %36 {__pipelining_first_stage__}
%c3 = arith.constant 3 : index
%38 = arith.muli %c16, %c3 : index
%39 = arith.addi %c0, %38 : index
gpu.barrier {__pipelining_first_stage__}
%40 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%39, %thread_id_x]
%41 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%39)
%42 = nvgpu.device_async_copy %1[%6, %40], %alloc_0[%41, %7, %8], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%43 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%39, %thread_id_x, %thread_id_y, %thread_id_z]
%44 = nvgpu.device_async_copy %2[%43, %9], %alloc_1[%41, %10, %11], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%45 = nvgpu.device_async_create_group %42, %44 {__pipelining_first_stage__}
%46:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %21, %arg3 = %29, %arg4 = %37, %arg5 = %45, %arg6 = %17, %arg7 = %25, %arg8 = %33, %arg9 = %41) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%c4 = arith.constant 4 : index
%57 = arith.muli %c16, %c4 : index
%58 = arith.subi %c1024, %57 : index
%59 = arith.cmpi slt, %arg0, %58 : index
%c3_3 = arith.constant 3 : index
%60 = arith.muli %c16, %c3_3 : index
%61 = arith.subi %c1024, %60 : index
%62 = arith.cmpi slt, %arg0, %61 : index
%c2_4 = arith.constant 2 : index
%63 = arith.muli %c16, %c2_4 : index
%64 = arith.subi %c1024, %63 : index
%65 = arith.cmpi slt, %arg0, %64 : index
%c1_5 = arith.constant 1 : index
%66 = arith.muli %c16, %c1_5 : index
%67 = arith.subi %c1024, %66 : index
%68 = arith.cmpi slt, %arg0, %67 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%69 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %12, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%70 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %12, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%71 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %13] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%72 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %13] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%73 = gpu.subgroup_mma_compute %69, %71, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%74 = gpu.subgroup_mma_compute %70, %72, %73 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%c4_6 = arith.constant 4 : index
%75 = arith.muli %c16, %c4_6 : index
%76 = arith.addi %arg0, %75 : index
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%76, %thread_id_x]
%c4_7 = arith.constant 4 : index
%78 = arith.muli %c16, %c4_7 : index
%79 = arith.addi %arg0, %78 : index
%80 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%79)
%c4_8 = arith.constant 4 : index
%c0_9 = arith.constant 0 : index
%81 = arith.select %59, %c4_8, %c0_9 : index
%82 = nvgpu.device_async_copy %1[%6, %77], %alloc_0[%80, %7, %8], 4, %81 {bypassL1} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%c4_10 = arith.constant 4 : index
%83 = arith.muli %c16, %c4_10 : index
%84 = arith.addi %arg0, %83 : index
%85 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%84, %thread_id_x, %thread_id_y, %thread_id_z]
%c4_11 = arith.constant 4 : index
%c0_12 = arith.constant 0 : index
%86 = arith.select %59, %c4_11, %c0_12 : index
%87 = nvgpu.device_async_copy %2[%85, %9], %alloc_1[%80, %10, %11], 4, %86 {bypassL1} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%88 = nvgpu.device_async_create_group %82, %87 {__pipelining_first_stage__}
scf.yield %74, %arg3, %arg4, %arg5, %88, %arg7, %arg8, %arg9, %80 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
%47 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%48 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.subgroup_mma_store_matrix %46#0, %alloc[%47, %48] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%49 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%50 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%51 = vector.transfer_read %alloc[%49, %50], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%52 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%4, %thread_id_x, %thread_id_y, %thread_id_z]
%53 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%5, %thread_id_x]
vector.transfer_write %51, %3[%52, %53] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%54 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%55 = vector.transfer_read %alloc[%54, %50], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%4, %thread_id_x, %thread_id_y, %thread_id_z]
vector.transfer_write %55, %3[%56, %53] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
return
}
}
}
// -----// IR Dump After ConvertHALDescriptorTypeToGPUAddressSpacePass (iree-codegen-convert-hal-descriptor-type-to-gpu-address-space) //----- //
module {
func.func @dot_dispatch_0() {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%4, %thread_id_x, %thread_id_y, %thread_id_z]
%7 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%8 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%5, %thread_id_x]
%10 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%11 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%12 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%13 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%c0_2 = arith.constant 0 : index
%14 = arith.muli %c16, %c0_2 : index
%15 = arith.addi %c0, %14 : index
gpu.barrier {__pipelining_first_stage__}
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%15, %thread_id_x]
%17 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%15)
%18 = nvgpu.device_async_copy %1[%6, %16], %alloc_0[%17, %7, %8], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%19 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%15, %thread_id_x, %thread_id_y, %thread_id_z]
%20 = nvgpu.device_async_copy %2[%19, %9], %alloc_1[%17, %10, %11], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%21 = nvgpu.device_async_create_group %18, %20 {__pipelining_first_stage__}
%c1 = arith.constant 1 : index
%22 = arith.muli %c16, %c1 : index
%23 = arith.addi %c0, %22 : index
gpu.barrier {__pipelining_first_stage__}
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%23, %thread_id_x]
%25 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%23)
%26 = nvgpu.device_async_copy %1[%6, %24], %alloc_0[%25, %7, %8], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%27 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%23, %thread_id_x, %thread_id_y, %thread_id_z]
%28 = nvgpu.device_async_copy %2[%27, %9], %alloc_1[%25, %10, %11], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%29 = nvgpu.device_async_create_group %26, %28 {__pipelining_first_stage__}
%c2 = arith.constant 2 : index
%30 = arith.muli %c16, %c2 : index
%31 = arith.addi %c0, %30 : index
gpu.barrier {__pipelining_first_stage__}
%32 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%31, %thread_id_x]
%33 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%31)
%34 = nvgpu.device_async_copy %1[%6, %32], %alloc_0[%33, %7, %8], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%35 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%31, %thread_id_x, %thread_id_y, %thread_id_z]
%36 = nvgpu.device_async_copy %2[%35, %9], %alloc_1[%33, %10, %11], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_create_group %34, %36 {__pipelining_first_stage__}
%c3 = arith.constant 3 : index
%38 = arith.muli %c16, %c3 : index
%39 = arith.addi %c0, %38 : index
gpu.barrier {__pipelining_first_stage__}
%40 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%39, %thread_id_x]
%41 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%39)
%42 = nvgpu.device_async_copy %1[%6, %40], %alloc_0[%41, %7, %8], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%43 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%39, %thread_id_x, %thread_id_y, %thread_id_z]
%44 = nvgpu.device_async_copy %2[%43, %9], %alloc_1[%41, %10, %11], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%45 = nvgpu.device_async_create_group %42, %44 {__pipelining_first_stage__}
%46:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %21, %arg3 = %29, %arg4 = %37, %arg5 = %45, %arg6 = %17, %arg7 = %25, %arg8 = %33, %arg9 = %41) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%c4 = arith.constant 4 : index
%57 = arith.muli %c16, %c4 : index
%58 = arith.subi %c1024, %57 : index
%59 = arith.cmpi slt, %arg0, %58 : index
%c3_3 = arith.constant 3 : index
%60 = arith.muli %c16, %c3_3 : index
%61 = arith.subi %c1024, %60 : index
%62 = arith.cmpi slt, %arg0, %61 : index
%c2_4 = arith.constant 2 : index
%63 = arith.muli %c16, %c2_4 : index
%64 = arith.subi %c1024, %63 : index
%65 = arith.cmpi slt, %arg0, %64 : index
%c1_5 = arith.constant 1 : index
%66 = arith.muli %c16, %c1_5 : index
%67 = arith.subi %c1024, %66 : index
%68 = arith.cmpi slt, %arg0, %67 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%69 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %12, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%70 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %12, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%71 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %13] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%72 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %13] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%73 = gpu.subgroup_mma_compute %69, %71, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%74 = gpu.subgroup_mma_compute %70, %72, %73 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%c4_6 = arith.constant 4 : index
%75 = arith.muli %c16, %c4_6 : index
%76 = arith.addi %arg0, %75 : index
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%76, %thread_id_x]
%c4_7 = arith.constant 4 : index
%78 = arith.muli %c16, %c4_7 : index
%79 = arith.addi %arg0, %78 : index
%80 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%79)
%c4_8 = arith.constant 4 : index
%c0_9 = arith.constant 0 : index
%81 = arith.select %59, %c4_8, %c0_9 : index
%82 = nvgpu.device_async_copy %1[%6, %77], %alloc_0[%80, %7, %8], 4, %81 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%c4_10 = arith.constant 4 : index
%83 = arith.muli %c16, %c4_10 : index
%84 = arith.addi %arg0, %83 : index
%85 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%84, %thread_id_x, %thread_id_y, %thread_id_z]
%c4_11 = arith.constant 4 : index
%c0_12 = arith.constant 0 : index
%86 = arith.select %59, %c4_11, %c0_12 : index
%87 = nvgpu.device_async_copy %2[%85, %9], %alloc_1[%80, %10, %11], 4, %86 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%88 = nvgpu.device_async_create_group %82, %87 {__pipelining_first_stage__}
scf.yield %74, %arg3, %arg4, %arg5, %88, %arg7, %arg8, %arg9, %80 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
%47 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%48 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.subgroup_mma_store_matrix %46#0, %alloc[%47, %48] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%49 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%50 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%51 = vector.transfer_read %alloc[%49, %50], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%52 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%4, %thread_id_x, %thread_id_y, %thread_id_z]
%53 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%5, %thread_id_x]
vector.transfer_write %51, %3[%52, %53] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%54 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%55 = vector.transfer_read %alloc[%54, %50], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%4, %thread_id_x, %thread_id_y, %thread_id_z]
vector.transfer_write %55, %3[%56, %53] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
}
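// After this pass the binding subspans carry #gpu.address_space<global> while the
// memref.alloc buffers stay in #gpu.address_space<workgroup>, which is the same split CUDA
// expresses with global-memory pointer parameters and statically sized __shared__ arrays.
// A minimal sketch of that mapping, assuming the tile shapes from the dump above; the kernel
// name and the trivial body are illustrative only.

__global__ void address_space_sketch(const float *lhs,   // binding(0): global memory
                                     const float *rhs,   // binding(1): global memory
                                     float *out) {       // binding(2): global memory
  __shared__ float c_tile[32][36];       // memref<32x36xf32,   #gpu.address_space<workgroup>>
  __shared__ float a_stages[4][32][20];  // memref<4x32x20xf32, #gpu.address_space<workgroup>>
  __shared__ float b_stages[4][16][36];  // memref<4x16x36xf32, #gpu.address_space<workgroup>>

  // Token traffic through each space so the kernel is self-contained; the real body
  // is the pipelined matmul shown in the dump above.
  int tid = threadIdx.x;
  if (tid < 16) {
    a_stages[0][tid][0] = lhs[tid];
    b_stages[0][tid][0] = rhs[tid];
    c_tile[tid][0] = a_stages[0][tid][0] + b_stages[0][tid][0];
    out[tid] = c_tile[tid][0];
  }
}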
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @dot_dispatch_0() {
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 16 + s2 * 32 + s3 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * 4 + s1 * 32 - (s0 floordiv 8) * 32)>()[%thread_id_x, %workgroup_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.barrier {__pipelining_first_stage__}
%12 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%13 = nvgpu.device_async_copy %1[%4, %12], %alloc_0[%c0, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%14 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%15 = nvgpu.device_async_copy %2[%14, %7], %alloc_1[%c0, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%16 = nvgpu.device_async_create_group %13, %15 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%17 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 16)>()[%thread_id_x]
%18 = nvgpu.device_async_copy %1[%4, %17], %alloc_0[%c1, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%19 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%20 = nvgpu.device_async_copy %2[%19, %7], %alloc_1[%c1, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%21 = nvgpu.device_async_create_group %18, %20 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%22 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 32)>()[%thread_id_x]
%23 = nvgpu.device_async_copy %1[%4, %22], %alloc_0[%c2, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%24 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 32)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%25 = nvgpu.device_async_copy %2[%24, %7], %alloc_1[%c2, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%26 = nvgpu.device_async_create_group %23, %25 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%27 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 48)>()[%thread_id_x]
%28 = nvgpu.device_async_copy %1[%4, %27], %alloc_0[%c3, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%29 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 48)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%30 = nvgpu.device_async_copy %2[%29, %7], %alloc_1[%c3, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%31 = nvgpu.device_async_create_group %28, %30 {__pipelining_first_stage__}
%32:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %16, %arg3 = %21, %arg4 = %26, %arg5 = %31, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%43 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%44 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%45 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%46 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%47 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%48 = gpu.subgroup_mma_compute %44, %46, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%49 = gpu.subgroup_mma_compute %45, %47, %48 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%50 = arith.addi %arg0, %c64 : index
%51 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%50, %thread_id_x]
%52 = arith.addi %arg0, %c64 : index
%53 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%52)
%54 = arith.select %43, %c4, %c0 : index
%55 = nvgpu.device_async_copy %1[%4, %51], %alloc_0[%53, %5, %6], 4, %54 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%56 = arith.addi %arg0, %c64 : index
%57 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%56, %thread_id_x, %thread_id_y, %thread_id_z]
%58 = arith.select %43, %c4, %c0 : index
%59 = nvgpu.device_async_copy %2[%57, %7], %alloc_1[%53, %8, %9], 4, %58 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%60 = nvgpu.device_async_create_group %55, %59 {__pipelining_first_stage__}
scf.yield %49, %arg3, %arg4, %arg5, %60, %arg7, %arg8, %arg9, %53 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
%33 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%34 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.subgroup_mma_store_matrix %32#0, %alloc[%33, %34] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%35 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%36 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%37 = vector.transfer_read %alloc[%35, %36], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%38 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
%39 = affine.apply affine_map<()[s0, s1] -> (s0 * 4 + s1 * 32 - (s0 floordiv 8) * 32)>()[%thread_id_x, %workgroup_id_x]
vector.transfer_write %37, %3[%38, %39] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%40 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%41 = vector.transfer_read %alloc[%40, %36], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%42 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %41, %3[%42, %39] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
}
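// Canonicalization has folded the pipelining arithmetic: 16 * 4 and 1024 - 16 * 4 become
// %c64 and %c960, the prologue offsets collapse into the +16/+32/+48 terms of the affine maps,
// and the stage to refill reduces to ((d0 floordiv 16) mod 4) applied to %arg0 + %c64, which is
// exactly the stage being consumed in the same iteration. A small host-side check of that
// folded rotation (illustrative only, not part of the generated code):

#include <cassert>

int main() {
  for (int k = 0; k < 1024; k += 16) {
    int consumed = (k / 16) % 4;         // %arg6: rotating stage index carried by iter_args
    int refilled = ((k + 64) / 16) % 4;  // affine_map ((d0 floordiv 16) mod 4) on %arg0 + %c64
    assert(consumed == refilled);        // each iteration refills the stage it just read,
                                         // with the K-tile needed 64 elements (4 steps) later
  }
  return 0;
}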
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @dot_dispatch_0() {
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 16 + s2 * 32 + s3 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * 4 + s1 * 32 - (s0 floordiv 8) * 32)>()[%thread_id_x, %workgroup_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.barrier {__pipelining_first_stage__}
%12 = nvgpu.device_async_copy %1[%4, %6], %alloc_0[%c0, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%13 = nvgpu.device_async_copy %2[%8, %7], %alloc_1[%c0, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%14 = nvgpu.device_async_create_group %12, %13 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%15 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 16)>()[%thread_id_x]
%16 = nvgpu.device_async_copy %1[%4, %15], %alloc_0[%c1, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%17 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%18 = nvgpu.device_async_copy %2[%17, %7], %alloc_1[%c1, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%19 = nvgpu.device_async_create_group %16, %18 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%20 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 32)>()[%thread_id_x]
%21 = nvgpu.device_async_copy %1[%4, %20], %alloc_0[%c2, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 32)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%23 = nvgpu.device_async_copy %2[%22, %7], %alloc_1[%c2, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%24 = nvgpu.device_async_create_group %21, %23 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%25 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 48)>()[%thread_id_x]
%26 = nvgpu.device_async_copy %1[%4, %25], %alloc_0[%c3, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%27 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 48)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%28 = nvgpu.device_async_copy %2[%27, %7], %alloc_1[%c3, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%29 = nvgpu.device_async_create_group %26, %28 {__pipelining_first_stage__}
%30:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %14, %arg3 = %19, %arg4 = %24, %arg5 = %29, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%35 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%36 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%37 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%38 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%39 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%40 = gpu.subgroup_mma_compute %36, %38, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%41 = gpu.subgroup_mma_compute %37, %39, %40 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%42 = arith.addi %arg0, %c64 : index
%43 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%42, %thread_id_x]
%44 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%42)
%45 = arith.select %35, %c4, %c0 : index
%46 = nvgpu.device_async_copy %1[%4, %43], %alloc_0[%44, %5, %6], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%47 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%42, %thread_id_x, %thread_id_y, %thread_id_z]
%48 = nvgpu.device_async_copy %2[%47, %7], %alloc_1[%44, %8, %9], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%49 = nvgpu.device_async_create_group %46, %48 {__pipelining_first_stage__}
scf.yield %41, %arg3, %arg4, %arg5, %49, %arg7, %arg8, %arg9, %44 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %30#0, %alloc[%10, %11] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%31 = vector.transfer_read %alloc[%8, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%32 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %31, %3[%32, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%33 = vector.transfer_read %alloc[%17, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%34 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %33, %3[%34, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
}
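// The pipeline shape is now easy to read: the prologue issues four async-copy groups (one per
// stage of the 4-deep shared buffers), and each steady-state iteration waits until at most three
// groups are still in flight (nvgpu.device_async_wait {numGroups = 3}, i.e. cp.async.wait_group 3),
// computes on the oldest stage, then re-fills that stage with the K-tile needed 64 elements
// (four iterations) later, predicated by the %c960 bound so the tail copies degenerate to empty
// groups. A stand-alone CUDA sketch of the same schedule using the __pipeline_* primitives
// (tile sizes, names, and the trivial per-thread "compute" are illustrative, assuming
// blockDim.x == 64; this is not the generated kernel):

#include <cuda_pipeline_primitives.h>

// Four shared-memory stages, rotated round-robin; each iteration consumes one stage and
// immediately re-issues an async copy into it for iteration t + 4.
__global__ void pipelined_k_loop_sketch(const float4 *a, float4 *out, int num_tiles) {
  __shared__ float4 stage[4][64];                    // analogue of the 4-stage workgroup buffers
  const int tid = threadIdx.x;                       // assumes blockDim.x == 64
  float4 acc = make_float4(0.f, 0.f, 0.f, 0.f);

  // Prologue: one 16-byte cp.async per thread per stage, each committed as its own group
  // (the four device_async_copy / device_async_create_group pairs above).
  for (int s = 0; s < 4 && s < num_tiles; ++s) {
    __pipeline_memcpy_async(&stage[s][tid], &a[s * 64 + tid], sizeof(float4));
    __pipeline_commit();
  }

  for (int t = 0; t < num_tiles; ++t) {
    const int cur = t % 4;                           // rotating stage index (the index iter_args)
    __pipeline_wait_prior(3);                        // nvgpu.device_async_wait {numGroups = 3}
    __syncthreads();                                 // gpu.barrier before reading shared memory

    float4 v = stage[cur][tid];                      // stand-in for the subgroup_mma compute
    acc.x += v.x; acc.y += v.y; acc.z += v.z; acc.w += v.w;

    __syncthreads();                                 // gpu.barrier {__pipelining_first_stage__}
    const int next = t + 4;                          // tile needed four iterations ahead
    if (next < num_tiles)                            // the arith.cmpi / arith.select predicate
      __pipeline_memcpy_async(&stage[cur][tid], &a[next * 64 + tid], sizeof(float4));
    __pipeline_commit();                             // empty groups keep the wait count consistent
  }
  out[blockIdx.x * 64 + tid] = acc;
}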
// -----// IR Dump After LowerUKernelOpsToCallsPass (iree-codegen-lower-ukernel-ops-to-calls) //----- //
module {
func.func @dot_dispatch_0() {
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 16 + s2 * 32 + s3 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * 4 + s1 * 32 - (s0 floordiv 8) * 32)>()[%thread_id_x, %workgroup_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.barrier {__pipelining_first_stage__}
%12 = nvgpu.device_async_copy %1[%4, %6], %alloc_0[%c0, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%13 = nvgpu.device_async_copy %2[%8, %7], %alloc_1[%c0, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%14 = nvgpu.device_async_create_group %12, %13 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%15 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 16)>()[%thread_id_x]
%16 = nvgpu.device_async_copy %1[%4, %15], %alloc_0[%c1, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%17 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%18 = nvgpu.device_async_copy %2[%17, %7], %alloc_1[%c1, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%19 = nvgpu.device_async_create_group %16, %18 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%20 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 32)>()[%thread_id_x]
%21 = nvgpu.device_async_copy %1[%4, %20], %alloc_0[%c2, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 32)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%23 = nvgpu.device_async_copy %2[%22, %7], %alloc_1[%c2, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%24 = nvgpu.device_async_create_group %21, %23 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%25 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 48)>()[%thread_id_x]
%26 = nvgpu.device_async_copy %1[%4, %25], %alloc_0[%c3, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%27 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 48)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%28 = nvgpu.device_async_copy %2[%27, %7], %alloc_1[%c3, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%29 = nvgpu.device_async_create_group %26, %28 {__pipelining_first_stage__}
%30:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %14, %arg3 = %19, %arg4 = %24, %arg5 = %29, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%35 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%36 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%37 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%38 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%39 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%40 = gpu.subgroup_mma_compute %36, %38, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%41 = gpu.subgroup_mma_compute %37, %39, %40 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%42 = arith.addi %arg0, %c64 : index
%43 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%42, %thread_id_x]
%44 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%42)
%45 = arith.select %35, %c4, %c0 : index
%46 = nvgpu.device_async_copy %1[%4, %43], %alloc_0[%44, %5, %6], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%47 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%42, %thread_id_x, %thread_id_y, %thread_id_z]
%48 = nvgpu.device_async_copy %2[%47, %7], %alloc_1[%44, %8, %9], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%49 = nvgpu.device_async_create_group %46, %48 {__pipelining_first_stage__}
scf.yield %41, %arg3, %arg4, %arg5, %49, %arg7, %arg8, %arg9, %44 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %30#0, %alloc[%10, %11] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%31 = vector.transfer_read %alloc[%8, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%32 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %31, %3[%32, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%33 = vector.transfer_read %alloc[%17, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%34 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %33, %3[%34, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
}
// -----// IR Dump After LinalgExtToLoopsPass (iree-linalg-ext-to-loops) //----- //
func.func @dot_dispatch_0() {
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 16 + s2 * 32 + s3 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * 4 + s1 * 32 - (s0 floordiv 8) * 32)>()[%thread_id_x, %workgroup_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.barrier {__pipelining_first_stage__}
%12 = nvgpu.device_async_copy %1[%4, %6], %alloc_0[%c0, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%13 = nvgpu.device_async_copy %2[%8, %7], %alloc_1[%c0, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%14 = nvgpu.device_async_create_group %12, %13 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%15 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 16)>()[%thread_id_x]
%16 = nvgpu.device_async_copy %1[%4, %15], %alloc_0[%c1, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%17 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%18 = nvgpu.device_async_copy %2[%17, %7], %alloc_1[%c1, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%19 = nvgpu.device_async_create_group %16, %18 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%20 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 32)>()[%thread_id_x]
%21 = nvgpu.device_async_copy %1[%4, %20], %alloc_0[%c2, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 32)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%23 = nvgpu.device_async_copy %2[%22, %7], %alloc_1[%c2, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%24 = nvgpu.device_async_create_group %21, %23 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%25 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 48)>()[%thread_id_x]
%26 = nvgpu.device_async_copy %1[%4, %25], %alloc_0[%c3, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%27 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 48)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%28 = nvgpu.device_async_copy %2[%27, %7], %alloc_1[%c3, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%29 = nvgpu.device_async_create_group %26, %28 {__pipelining_first_stage__}
%30:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %14, %arg3 = %19, %arg4 = %24, %arg5 = %29, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%35 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%36 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%37 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%38 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%39 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%40 = gpu.subgroup_mma_compute %36, %38, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%41 = gpu.subgroup_mma_compute %37, %39, %40 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%42 = arith.addi %arg0, %c64 : index
%43 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%42, %thread_id_x]
%44 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%42)
%45 = arith.select %35, %c4, %c0 : index
%46 = nvgpu.device_async_copy %1[%4, %43], %alloc_0[%44, %5, %6], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%47 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%42, %thread_id_x, %thread_id_y, %thread_id_z]
%48 = nvgpu.device_async_copy %2[%47, %7], %alloc_1[%44, %8, %9], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%49 = nvgpu.device_async_create_group %46, %48 {__pipelining_first_stage__}
scf.yield %41, %arg3, %arg4, %arg5, %49, %arg7, %arg8, %arg9, %44 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %30#0, %alloc[%10, %11] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%31 = vector.transfer_read %alloc[%8, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%32 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %31, %3[%32, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%33 = vector.transfer_read %alloc[%17, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%34 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %33, %3[%34, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
// -----// IR Dump After MemrefCopyToLinalgPass (iree-codegen-memrefcopy-to-linalg) //----- //
func.func @dot_dispatch_0() {
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 16 + s2 * 32 + s3 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * 4 + s1 * 32 - (s0 floordiv 8) * 32)>()[%thread_id_x, %workgroup_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.barrier {__pipelining_first_stage__}
%12 = nvgpu.device_async_copy %1[%4, %6], %alloc_0[%c0, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%13 = nvgpu.device_async_copy %2[%8, %7], %alloc_1[%c0, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%14 = nvgpu.device_async_create_group %12, %13 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%15 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 16)>()[%thread_id_x]
%16 = nvgpu.device_async_copy %1[%4, %15], %alloc_0[%c1, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%17 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%18 = nvgpu.device_async_copy %2[%17, %7], %alloc_1[%c1, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%19 = nvgpu.device_async_create_group %16, %18 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%20 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 32)>()[%thread_id_x]
%21 = nvgpu.device_async_copy %1[%4, %20], %alloc_0[%c2, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 32)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%23 = nvgpu.device_async_copy %2[%22, %7], %alloc_1[%c2, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%24 = nvgpu.device_async_create_group %21, %23 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%25 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 48)>()[%thread_id_x]
%26 = nvgpu.device_async_copy %1[%4, %25], %alloc_0[%c3, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%27 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 48)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%28 = nvgpu.device_async_copy %2[%27, %7], %alloc_1[%c3, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%29 = nvgpu.device_async_create_group %26, %28 {__pipelining_first_stage__}
%30:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %14, %arg3 = %19, %arg4 = %24, %arg5 = %29, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%35 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%36 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%37 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%38 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%39 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%40 = gpu.subgroup_mma_compute %36, %38, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%41 = gpu.subgroup_mma_compute %37, %39, %40 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%42 = arith.addi %arg0, %c64 : index
%43 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%42, %thread_id_x]
%44 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%42)
%45 = arith.select %35, %c4, %c0 : index
%46 = nvgpu.device_async_copy %1[%4, %43], %alloc_0[%44, %5, %6], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%47 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%42, %thread_id_x, %thread_id_y, %thread_id_z]
%48 = nvgpu.device_async_copy %2[%47, %7], %alloc_1[%44, %8, %9], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%49 = nvgpu.device_async_create_group %46, %48 {__pipelining_first_stage__}
scf.yield %41, %arg3, %arg4, %arg5, %49, %arg7, %arg8, %arg9, %44 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %30#0, %alloc[%10, %11] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%31 = vector.transfer_read %alloc[%8, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%32 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %31, %3[%32, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%33 = vector.transfer_read %alloc[%17, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%34 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %33, %3[%34, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
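
// The two workgroup buffers hold a 32x16 tile of the A operand and a 16x32 tile of the B
// operand per pipeline slot; their trailing dimensions are padded to 20 and 36 (the
// leadDimension values used by the mma loads), presumably to keep shared-memory accesses
// spread across banks. The sketch below checks that the per-thread index maps %5/%6 and
// %8/%9 partition each tile exactly once across the workgroup for the 4-wide cp.async
// copies; the 64x2x1 workgroup shape is an inference from those maps, not something stated
// in this dump.

VEC = 4
a_cover, b_cover = set(), set()

for tz in range(1):
    for ty in range(2):
        for tx in range(64):
            # A tile (stored in %alloc_0[slot, 32, 20], 16 payload columns)
            a_row = ty * 16 + tz * 32 + tx // 4        # %5
            a_col = tx * 4 - (tx // 4) * 16            # %6
            a_cover |= {(a_row, a_col + i) for i in range(VEC)}
            # B tile (stored in %alloc_1[slot, 16, 36], 32 payload columns)
            b_row = ty * 8 + tz * 16 + tx // 8         # %8
            b_col = tx * 4 - (tx // 8) * 32            # %9
            b_cover |= {(b_row, b_col + i) for i in range(VEC)}

assert a_cover == {(r, c) for r in range(32) for c in range(16)}
assert b_cover == {(r, c) for r in range(16) for c in range(32)}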
// -----// IR Dump After ConvertLinalgToLoopsPass (convert-linalg-to-loops) //----- //
func.func @dot_dispatch_0() {
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 16 + s2 * 32 + s3 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * 4 + s1 * 32 - (s0 floordiv 8) * 32)>()[%thread_id_x, %workgroup_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.barrier {__pipelining_first_stage__}
%12 = nvgpu.device_async_copy %1[%4, %6], %alloc_0[%c0, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%13 = nvgpu.device_async_copy %2[%8, %7], %alloc_1[%c0, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%14 = nvgpu.device_async_create_group %12, %13 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%15 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 16)>()[%thread_id_x]
%16 = nvgpu.device_async_copy %1[%4, %15], %alloc_0[%c1, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%17 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%18 = nvgpu.device_async_copy %2[%17, %7], %alloc_1[%c1, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%19 = nvgpu.device_async_create_group %16, %18 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%20 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 32)>()[%thread_id_x]
%21 = nvgpu.device_async_copy %1[%4, %20], %alloc_0[%c2, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 32)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%23 = nvgpu.device_async_copy %2[%22, %7], %alloc_1[%c2, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%24 = nvgpu.device_async_create_group %21, %23 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%25 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 48)>()[%thread_id_x]
%26 = nvgpu.device_async_copy %1[%4, %25], %alloc_0[%c3, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%27 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 48)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%28 = nvgpu.device_async_copy %2[%27, %7], %alloc_1[%c3, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%29 = nvgpu.device_async_create_group %26, %28 {__pipelining_first_stage__}
%30:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %14, %arg3 = %19, %arg4 = %24, %arg5 = %29, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%35 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%36 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%37 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%38 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%39 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%40 = gpu.subgroup_mma_compute %36, %38, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%41 = gpu.subgroup_mma_compute %37, %39, %40 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%42 = arith.addi %arg0, %c64 : index
%43 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%42, %thread_id_x]
%44 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%42)
%45 = arith.select %35, %c4, %c0 : index
%46 = nvgpu.device_async_copy %1[%4, %43], %alloc_0[%44, %5, %6], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%47 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%42, %thread_id_x, %thread_id_y, %thread_id_z]
%48 = nvgpu.device_async_copy %2[%47, %7], %alloc_1[%44, %8, %9], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%49 = nvgpu.device_async_create_group %46, %48 {__pipelining_first_stage__}
scf.yield %41, %arg3, %arg4, %arg5, %49, %arg7, %arg8, %arg9, %44 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %30#0, %alloc[%10, %11] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%31 = vector.transfer_read %alloc[%8, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%32 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %31, %3[%32, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%33 = vector.transfer_read %alloc[%17, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%34 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %33, %3[%34, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() {
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 16 + s2 * 32 + s3 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * 4 + s1 * 32 - (s0 floordiv 8) * 32)>()[%thread_id_x, %workgroup_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.barrier {__pipelining_first_stage__}
%12 = nvgpu.device_async_copy %1[%4, %6], %alloc_0[%c0, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%13 = nvgpu.device_async_copy %2[%8, %7], %alloc_1[%c0, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%14 = nvgpu.device_async_create_group %12, %13 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%15 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 16)>()[%thread_id_x]
%16 = nvgpu.device_async_copy %1[%4, %15], %alloc_0[%c1, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%17 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%18 = nvgpu.device_async_copy %2[%17, %7], %alloc_1[%c1, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%19 = nvgpu.device_async_create_group %16, %18 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%20 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 32)>()[%thread_id_x]
%21 = nvgpu.device_async_copy %1[%4, %20], %alloc_0[%c2, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 32)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%23 = nvgpu.device_async_copy %2[%22, %7], %alloc_1[%c2, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%24 = nvgpu.device_async_create_group %21, %23 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%25 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 48)>()[%thread_id_x]
%26 = nvgpu.device_async_copy %1[%4, %25], %alloc_0[%c3, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%27 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 48)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%28 = nvgpu.device_async_copy %2[%27, %7], %alloc_1[%c3, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%29 = nvgpu.device_async_create_group %26, %28 {__pipelining_first_stage__}
%30:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %14, %arg3 = %19, %arg4 = %24, %arg5 = %29, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%35 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%36 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%37 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%38 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%39 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%40 = gpu.subgroup_mma_compute %36, %38, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%41 = gpu.subgroup_mma_compute %37, %39, %40 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%42 = arith.addi %arg0, %c64 : index
%43 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%42, %thread_id_x]
%44 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%42)
%45 = arith.select %35, %c4, %c0 : index
%46 = nvgpu.device_async_copy %1[%4, %43], %alloc_0[%44, %5, %6], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%47 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%42, %thread_id_x, %thread_id_y, %thread_id_z]
%48 = nvgpu.device_async_copy %2[%47, %7], %alloc_1[%44, %8, %9], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%49 = nvgpu.device_async_create_group %46, %48 {__pipelining_first_stage__}
scf.yield %41, %arg3, %arg4, %arg5, %49, %arg7, %arg8, %arg9, %44 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %30#0, %alloc[%10, %11] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%31 = vector.transfer_read %alloc[%8, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%32 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %31, %3[%32, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%33 = vector.transfer_read %alloc[%17, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%34 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %33, %3[%34, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
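
// After the loop, each warp stores its 16x16 accumulator fragment into the padded 32x36
// workgroup buffer, and every thread then reads two 4-wide rows of that buffer (16 rows
// apart) and writes them to global memory at [%32, %7] and [%34, %7], so the fragment
// layout becomes contiguous 4-element global stores. A small sketch of that staging, under
// the same assumed 64x2x1 workgroup (32 lanes per warp):

frag_origins = set()
read_cover = set()

for tz in range(1):
    for ty in range(2):
        for tx in range(64):
            # gpu.subgroup_mma_store_matrix destination per warp: [%10, %11]
            frag_origins.add((ty * 16, (tx // 32) * 16))
            # vector.transfer_read offsets per thread: [%8, %9] and [%17, %9]
            row = ty * 8 + tz * 16 + tx // 8           # %8
            col = tx * 4 - (tx // 8) * 32              # %9
            for r in (row, row + 16):                  # %17 = %8 + 16
                read_cover |= {(r, col + i) for i in range(4)}

assert frag_origins == {(0, 0), (0, 16), (16, 0), (16, 16)}
assert read_cover == {(r, c) for r in range(32) for c in range(32)}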
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() {
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 16 + s2 * 32 + s3 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * 4 + s1 * 32 - (s0 floordiv 8) * 32)>()[%thread_id_x, %workgroup_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.barrier {__pipelining_first_stage__}
%12 = nvgpu.device_async_copy %1[%4, %6], %alloc_0[%c0, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%13 = nvgpu.device_async_copy %2[%8, %7], %alloc_1[%c0, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%14 = nvgpu.device_async_create_group %12, %13 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%15 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 16)>()[%thread_id_x]
%16 = nvgpu.device_async_copy %1[%4, %15], %alloc_0[%c1, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%17 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%18 = nvgpu.device_async_copy %2[%17, %7], %alloc_1[%c1, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%19 = nvgpu.device_async_create_group %16, %18 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%20 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 32)>()[%thread_id_x]
%21 = nvgpu.device_async_copy %1[%4, %20], %alloc_0[%c2, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 32)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%23 = nvgpu.device_async_copy %2[%22, %7], %alloc_1[%c2, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%24 = nvgpu.device_async_create_group %21, %23 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%25 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 48)>()[%thread_id_x]
%26 = nvgpu.device_async_copy %1[%4, %25], %alloc_0[%c3, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%27 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 48)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%28 = nvgpu.device_async_copy %2[%27, %7], %alloc_1[%c3, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%29 = nvgpu.device_async_create_group %26, %28 {__pipelining_first_stage__}
%30:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %14, %arg3 = %19, %arg4 = %24, %arg5 = %29, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%35 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%36 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%37 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%38 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%39 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%40 = gpu.subgroup_mma_compute %36, %38, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%41 = gpu.subgroup_mma_compute %37, %39, %40 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%42 = arith.addi %arg0, %c64 : index
%43 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%42, %thread_id_x]
%44 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%42)
%45 = arith.select %35, %c4, %c0 : index
%46 = nvgpu.device_async_copy %1[%4, %43], %alloc_0[%44, %5, %6], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%47 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%42, %thread_id_x, %thread_id_y, %thread_id_z]
%48 = nvgpu.device_async_copy %2[%47, %7], %alloc_1[%44, %8, %9], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%49 = nvgpu.device_async_create_group %46, %48 {__pipelining_first_stage__}
scf.yield %41, %arg3, %arg4, %arg5, %49, %arg7, %arg8, %arg9, %44 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %30#0, %alloc[%10, %11] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%31 = vector.transfer_read %alloc[%8, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%32 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %31, %3[%32, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%33 = vector.transfer_read %alloc[%17, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%34 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %33, %3[%34, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
// -----// IR Dump After PadDynamicAllocPass (iree-codegen-pad-dynamic-alloc) //----- //
func.func @dot_dispatch_0() {
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 16 + s2 * 32 + s3 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * 4 + s1 * 32 - (s0 floordiv 8) * 32)>()[%thread_id_x, %workgroup_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.barrier {__pipelining_first_stage__}
%12 = nvgpu.device_async_copy %1[%4, %6], %alloc_0[%c0, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%13 = nvgpu.device_async_copy %2[%8, %7], %alloc_1[%c0, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%14 = nvgpu.device_async_create_group %12, %13 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%15 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 16)>()[%thread_id_x]
%16 = nvgpu.device_async_copy %1[%4, %15], %alloc_0[%c1, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%17 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%18 = nvgpu.device_async_copy %2[%17, %7], %alloc_1[%c1, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%19 = nvgpu.device_async_create_group %16, %18 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%20 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 32)>()[%thread_id_x]
%21 = nvgpu.device_async_copy %1[%4, %20], %alloc_0[%c2, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 32)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%23 = nvgpu.device_async_copy %2[%22, %7], %alloc_1[%c2, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%24 = nvgpu.device_async_create_group %21, %23 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%25 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 48)>()[%thread_id_x]
%26 = nvgpu.device_async_copy %1[%4, %25], %alloc_0[%c3, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%27 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 48)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%28 = nvgpu.device_async_copy %2[%27, %7], %alloc_1[%c3, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%29 = nvgpu.device_async_create_group %26, %28 {__pipelining_first_stage__}
%30:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %14, %arg3 = %19, %arg4 = %24, %arg5 = %29, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%35 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%36 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%37 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%38 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%39 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%40 = gpu.subgroup_mma_compute %36, %38, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%41 = gpu.subgroup_mma_compute %37, %39, %40 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%42 = arith.addi %arg0, %c64 : index
%43 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%42, %thread_id_x]
%44 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%42)
%45 = arith.select %35, %c4, %c0 : index
%46 = nvgpu.device_async_copy %1[%4, %43], %alloc_0[%44, %5, %6], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%47 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%42, %thread_id_x, %thread_id_y, %thread_id_z]
%48 = nvgpu.device_async_copy %2[%47, %7], %alloc_1[%44, %8, %9], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%49 = nvgpu.device_async_create_group %46, %48 {__pipelining_first_stage__}
scf.yield %41, %arg3, %arg4, %arg5, %49, %arg7, %arg8, %arg9, %44 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %30#0, %alloc[%10, %11] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%31 = vector.transfer_read %alloc[%8, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%32 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %31, %3[%32, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%33 = vector.transfer_read %alloc[%17, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%34 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %33, %3[%34, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
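
// The ConvertAffineToStandard dump below expands every `floordiv` by a positive constant
// into a cmpi/subi/select/divsi/subi/select sequence that rounds toward negative infinity
// (plain arith.divsi truncates toward zero). A small Python sketch of that expansion,
// mirroring the %9..%14 pattern emitted for `%thread_id_x floordiv 4`; an annotation added
// here, not compiler output:

def trunc_div(a, b):
    # arith.divsi semantics: C-style signed division, truncating toward zero.
    q = abs(a) // abs(b)
    return q if (a >= 0) == (b >= 0) else -q

def lowered_floordiv(x, c):
    neg = x < 0                      # %9  = arith.cmpi slt, x, 0
    x_adj = -1 - x if neg else x     # %10 = arith.subi -1, x ; %11 = arith.select
    q = trunc_div(x_adj, c)          # %12 = arith.divsi
    return -1 - q if neg else q      # %13 = arith.subi -1, q ; %14 = arith.select

for x in range(-20, 20):
    for c in (4, 8, 16, 32):
        assert lowered_floordiv(x, c) == x // c   # Python // is floor division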
// -----// IR Dump After ConvertAffineToStandard (lower-affine) //----- //
func.func @dot_dispatch_0() {
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%c16_2 = arith.constant 16 : index
%4 = arith.muli %thread_id_y, %c16_2 : index
%c32 = arith.constant 32 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%c32_3 = arith.constant 32 : index
%7 = arith.muli %workgroup_id_y, %c32_3 : index
%8 = arith.addi %6, %7 : index
%c4_4 = arith.constant 4 : index
%c0_5 = arith.constant 0 : index
%c-1 = arith.constant -1 : index
%9 = arith.cmpi slt, %thread_id_x, %c0_5 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4_4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%c16_6 = arith.constant 16 : index
%16 = arith.muli %thread_id_y, %c16_6 : index
%c32_7 = arith.constant 32 : index
%17 = arith.muli %thread_id_z, %c32_7 : index
%18 = arith.addi %16, %17 : index
%c4_8 = arith.constant 4 : index
%c0_9 = arith.constant 0 : index
%c-1_10 = arith.constant -1 : index
%19 = arith.cmpi slt, %thread_id_x, %c0_9 : index
%20 = arith.subi %c-1_10, %thread_id_x : index
%21 = arith.select %19, %20, %thread_id_x : index
%22 = arith.divsi %21, %c4_8 : index
%23 = arith.subi %c-1_10, %22 : index
%24 = arith.select %19, %23, %22 : index
%25 = arith.addi %18, %24 : index
%c4_11 = arith.constant 4 : index
%26 = arith.muli %thread_id_x, %c4_11 : index
%c4_12 = arith.constant 4 : index
%c0_13 = arith.constant 0 : index
%c-1_14 = arith.constant -1 : index
%27 = arith.cmpi slt, %thread_id_x, %c0_13 : index
%28 = arith.subi %c-1_14, %thread_id_x : index
%29 = arith.select %27, %28, %thread_id_x : index
%30 = arith.divsi %29, %c4_12 : index
%31 = arith.subi %c-1_14, %30 : index
%32 = arith.select %27, %31, %30 : index
%c-16 = arith.constant -16 : index
%33 = arith.muli %32, %c-16 : index
%34 = arith.addi %26, %33 : index
%c4_15 = arith.constant 4 : index
%35 = arith.muli %thread_id_x, %c4_15 : index
%c32_16 = arith.constant 32 : index
%36 = arith.muli %workgroup_id_x, %c32_16 : index
%37 = arith.addi %35, %36 : index
%c8_17 = arith.constant 8 : index
%c0_18 = arith.constant 0 : index
%c-1_19 = arith.constant -1 : index
%38 = arith.cmpi slt, %thread_id_x, %c0_18 : index
%39 = arith.subi %c-1_19, %thread_id_x : index
%40 = arith.select %38, %39, %thread_id_x : index
%41 = arith.divsi %40, %c8_17 : index
%42 = arith.subi %c-1_19, %41 : index
%43 = arith.select %38, %42, %41 : index
%c-32 = arith.constant -32 : index
%44 = arith.muli %43, %c-32 : index
%45 = arith.addi %37, %44 : index
%c8_20 = arith.constant 8 : index
%46 = arith.muli %thread_id_y, %c8_20 : index
%c16_21 = arith.constant 16 : index
%47 = arith.muli %thread_id_z, %c16_21 : index
%48 = arith.addi %46, %47 : index
%c8_22 = arith.constant 8 : index
%c0_23 = arith.constant 0 : index
%c-1_24 = arith.constant -1 : index
%49 = arith.cmpi slt, %thread_id_x, %c0_23 : index
%50 = arith.subi %c-1_24, %thread_id_x : index
%51 = arith.select %49, %50, %thread_id_x : index
%52 = arith.divsi %51, %c8_22 : index
%53 = arith.subi %c-1_24, %52 : index
%54 = arith.select %49, %53, %52 : index
%55 = arith.addi %48, %54 : index
%c4_25 = arith.constant 4 : index
%56 = arith.muli %thread_id_x, %c4_25 : index
%c8_26 = arith.constant 8 : index
%c0_27 = arith.constant 0 : index
%c-1_28 = arith.constant -1 : index
%57 = arith.cmpi slt, %thread_id_x, %c0_27 : index
%58 = arith.subi %c-1_28, %thread_id_x : index
%59 = arith.select %57, %58, %thread_id_x : index
%60 = arith.divsi %59, %c8_26 : index
%61 = arith.subi %c-1_28, %60 : index
%62 = arith.select %57, %61, %60 : index
%c-32_29 = arith.constant -32 : index
%63 = arith.muli %62, %c-32_29 : index
%64 = arith.addi %56, %63 : index
%c16_30 = arith.constant 16 : index
%65 = arith.muli %thread_id_y, %c16_30 : index
%c32_31 = arith.constant 32 : index
%c0_32 = arith.constant 0 : index
%c-1_33 = arith.constant -1 : index
%66 = arith.cmpi slt, %thread_id_x, %c0_32 : index
%67 = arith.subi %c-1_33, %thread_id_x : index
%68 = arith.select %66, %67, %thread_id_x : index
%69 = arith.divsi %68, %c32_31 : index
%70 = arith.subi %c-1_33, %69 : index
%71 = arith.select %66, %70, %69 : index
%c16_34 = arith.constant 16 : index
%72 = arith.muli %71, %c16_34 : index
gpu.barrier {__pipelining_first_stage__}
%73 = nvgpu.device_async_copy %1[%15, %34], %alloc_0[%c0, %25, %34], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%74 = nvgpu.device_async_copy %2[%55, %45], %alloc_1[%c0, %55, %64], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%75 = nvgpu.device_async_create_group %73, %74 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%c4_35 = arith.constant 4 : index
%76 = arith.muli %thread_id_x, %c4_35 : index
%c4_36 = arith.constant 4 : index
%c0_37 = arith.constant 0 : index
%c-1_38 = arith.constant -1 : index
%77 = arith.cmpi slt, %thread_id_x, %c0_37 : index
%78 = arith.subi %c-1_38, %thread_id_x : index
%79 = arith.select %77, %78, %thread_id_x : index
%80 = arith.divsi %79, %c4_36 : index
%81 = arith.subi %c-1_38, %80 : index
%82 = arith.select %77, %81, %80 : index
%c-16_39 = arith.constant -16 : index
%83 = arith.muli %82, %c-16_39 : index
%84 = arith.addi %76, %83 : index
%c16_40 = arith.constant 16 : index
%85 = arith.addi %84, %c16_40 : index
%86 = nvgpu.device_async_copy %1[%15, %85], %alloc_0[%c1, %25, %34], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%c8_41 = arith.constant 8 : index
%87 = arith.muli %thread_id_y, %c8_41 : index
%c16_42 = arith.constant 16 : index
%88 = arith.muli %thread_id_z, %c16_42 : index
%89 = arith.addi %87, %88 : index
%c8_43 = arith.constant 8 : index
%c0_44 = arith.constant 0 : index
%c-1_45 = arith.constant -1 : index
%90 = arith.cmpi slt, %thread_id_x, %c0_44 : index
%91 = arith.subi %c-1_45, %thread_id_x : index
%92 = arith.select %90, %91, %thread_id_x : index
%93 = arith.divsi %92, %c8_43 : index
%94 = arith.subi %c-1_45, %93 : index
%95 = arith.select %90, %94, %93 : index
%96 = arith.addi %89, %95 : index
%c16_46 = arith.constant 16 : index
%97 = arith.addi %96, %c16_46 : index
%98 = nvgpu.device_async_copy %2[%97, %45], %alloc_1[%c1, %55, %64], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%99 = nvgpu.device_async_create_group %86, %98 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%c4_47 = arith.constant 4 : index
%100 = arith.muli %thread_id_x, %c4_47 : index
%c4_48 = arith.constant 4 : index
%c0_49 = arith.constant 0 : index
%c-1_50 = arith.constant -1 : index
%101 = arith.cmpi slt, %thread_id_x, %c0_49 : index
%102 = arith.subi %c-1_50, %thread_id_x : index
%103 = arith.select %101, %102, %thread_id_x : index
%104 = arith.divsi %103, %c4_48 : index
%105 = arith.subi %c-1_50, %104 : index
%106 = arith.select %101, %105, %104 : index
%c-16_51 = arith.constant -16 : index
%107 = arith.muli %106, %c-16_51 : index
%108 = arith.addi %100, %107 : index
%c32_52 = arith.constant 32 : index
%109 = arith.addi %108, %c32_52 : index
%110 = nvgpu.device_async_copy %1[%15, %109], %alloc_0[%c2, %25, %34], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%c8_53 = arith.constant 8 : index
%111 = arith.muli %thread_id_y, %c8_53 : index
%c16_54 = arith.constant 16 : index
%112 = arith.muli %thread_id_z, %c16_54 : index
%113 = arith.addi %111, %112 : index
%c8_55 = arith.constant 8 : index
%c0_56 = arith.constant 0 : index
%c-1_57 = arith.constant -1 : index
%114 = arith.cmpi slt, %thread_id_x, %c0_56 : index
%115 = arith.subi %c-1_57, %thread_id_x : index
%116 = arith.select %114, %115, %thread_id_x : index
%117 = arith.divsi %116, %c8_55 : index
%118 = arith.subi %c-1_57, %117 : index
%119 = arith.select %114, %118, %117 : index
%120 = arith.addi %113, %119 : index
%c32_58 = arith.constant 32 : index
%121 = arith.addi %120, %c32_58 : index
%122 = nvgpu.device_async_copy %2[%121, %45], %alloc_1[%c2, %55, %64], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%123 = nvgpu.device_async_create_group %110, %122 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%c4_59 = arith.constant 4 : index
%124 = arith.muli %thread_id_x, %c4_59 : index
%c4_60 = arith.constant 4 : index
%c0_61 = arith.constant 0 : index
%c-1_62 = arith.constant -1 : index
%125 = arith.cmpi slt, %thread_id_x, %c0_61 : index
%126 = arith.subi %c-1_62, %thread_id_x : index
%127 = arith.select %125, %126, %thread_id_x : index
%128 = arith.divsi %127, %c4_60 : index
%129 = arith.subi %c-1_62, %128 : index
%130 = arith.select %125, %129, %128 : index
%c-16_63 = arith.constant -16 : index
%131 = arith.muli %130, %c-16_63 : index
%132 = arith.addi %124, %131 : index
%c48 = arith.constant 48 : index
%133 = arith.addi %132, %c48 : index
%134 = nvgpu.device_async_copy %1[%15, %133], %alloc_0[%c3, %25, %34], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%c8_64 = arith.constant 8 : index
%135 = arith.muli %thread_id_y, %c8_64 : index
%c16_65 = arith.constant 16 : index
%136 = arith.muli %thread_id_z, %c16_65 : index
%137 = arith.addi %135, %136 : index
%c8_66 = arith.constant 8 : index
%c0_67 = arith.constant 0 : index
%c-1_68 = arith.constant -1 : index
%138 = arith.cmpi slt, %thread_id_x, %c0_67 : index
%139 = arith.subi %c-1_68, %thread_id_x : index
%140 = arith.select %138, %139, %thread_id_x : index
%141 = arith.divsi %140, %c8_66 : index
%142 = arith.subi %c-1_68, %141 : index
%143 = arith.select %138, %142, %141 : index
%144 = arith.addi %137, %143 : index
%c48_69 = arith.constant 48 : index
%145 = arith.addi %144, %c48_69 : index
%146 = nvgpu.device_async_copy %2[%145, %45], %alloc_1[%c3, %55, %64], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%147 = nvgpu.device_async_create_group %134, %146 {__pipelining_first_stage__}
%148:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %75, %arg3 = %99, %arg4 = %123, %arg5 = %147, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%176 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%177 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %65, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%178 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %65, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%179 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %72] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%180 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %72] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%181 = gpu.subgroup_mma_compute %177, %179, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%182 = gpu.subgroup_mma_compute %178, %180, %181 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%183 = arith.addi %arg0, %c64 : index
%c4_83 = arith.constant 4 : index
%184 = arith.muli %thread_id_x, %c4_83 : index
%185 = arith.addi %183, %184 : index
%c4_84 = arith.constant 4 : index
%c0_85 = arith.constant 0 : index
%c-1_86 = arith.constant -1 : index
%186 = arith.cmpi slt, %thread_id_x, %c0_85 : index
%187 = arith.subi %c-1_86, %thread_id_x : index
%188 = arith.select %186, %187, %thread_id_x : index
%189 = arith.divsi %188, %c4_84 : index
%190 = arith.subi %c-1_86, %189 : index
%191 = arith.select %186, %190, %189 : index
%c-16_87 = arith.constant -16 : index
%192 = arith.muli %191, %c-16_87 : index
%193 = arith.addi %185, %192 : index
%c16_88 = arith.constant 16 : index
%c0_89 = arith.constant 0 : index
%c-1_90 = arith.constant -1 : index
%194 = arith.cmpi slt, %183, %c0_89 : index
%195 = arith.subi %c-1_90, %183 : index
%196 = arith.select %194, %195, %183 : index
%197 = arith.divsi %196, %c16_88 : index
%198 = arith.subi %c-1_90, %197 : index
%199 = arith.select %194, %198, %197 : index
%c4_91 = arith.constant 4 : index
%200 = arith.remsi %199, %c4_91 : index
%c0_92 = arith.constant 0 : index
%201 = arith.cmpi slt, %200, %c0_92 : index
%202 = arith.addi %200, %c4_91 : index
%203 = arith.select %201, %202, %200 : index
%204 = arith.select %176, %c4, %c0 : index
%205 = nvgpu.device_async_copy %1[%15, %193], %alloc_0[%203, %25, %34], 4, %204 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%c8_93 = arith.constant 8 : index
%206 = arith.muli %thread_id_y, %c8_93 : index
%207 = arith.addi %183, %206 : index
%c16_94 = arith.constant 16 : index
%208 = arith.muli %thread_id_z, %c16_94 : index
%209 = arith.addi %207, %208 : index
%c8_95 = arith.constant 8 : index
%c0_96 = arith.constant 0 : index
%c-1_97 = arith.constant -1 : index
%210 = arith.cmpi slt, %thread_id_x, %c0_96 : index
%211 = arith.subi %c-1_97, %thread_id_x : index
%212 = arith.select %210, %211, %thread_id_x : index
%213 = arith.divsi %212, %c8_95 : index
%214 = arith.subi %c-1_97, %213 : index
%215 = arith.select %210, %214, %213 : index
%216 = arith.addi %209, %215 : index
%217 = nvgpu.device_async_copy %2[%216, %45], %alloc_1[%203, %55, %64], 4, %204 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%218 = nvgpu.device_async_create_group %205, %217 {__pipelining_first_stage__}
scf.yield %182, %arg3, %arg4, %arg5, %218, %arg7, %arg8, %arg9, %203 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %148#0, %alloc[%65, %72] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%149 = vector.transfer_read %alloc[%55, %64], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%c8_70 = arith.constant 8 : index
%150 = arith.muli %thread_id_y, %c8_70 : index
%c16_71 = arith.constant 16 : index
%151 = arith.muli %thread_id_z, %c16_71 : index
%152 = arith.addi %150, %151 : index
%c32_72 = arith.constant 32 : index
%153 = arith.muli %workgroup_id_y, %c32_72 : index
%154 = arith.addi %152, %153 : index
%c8_73 = arith.constant 8 : index
%c0_74 = arith.constant 0 : index
%c-1_75 = arith.constant -1 : index
%155 = arith.cmpi slt, %thread_id_x, %c0_74 : index
%156 = arith.subi %c-1_75, %thread_id_x : index
%157 = arith.select %155, %156, %thread_id_x : index
%158 = arith.divsi %157, %c8_73 : index
%159 = arith.subi %c-1_75, %158 : index
%160 = arith.select %155, %159, %158 : index
%161 = arith.addi %154, %160 : index
vector.transfer_write %149, %3[%161, %45] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%162 = vector.transfer_read %alloc[%97, %64], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%c8_76 = arith.constant 8 : index
%163 = arith.muli %thread_id_y, %c8_76 : index
%c16_77 = arith.constant 16 : index
%164 = arith.muli %thread_id_z, %c16_77 : index
%165 = arith.addi %163, %164 : index
%c32_78 = arith.constant 32 : index
%166 = arith.muli %workgroup_id_y, %c32_78 : index
%167 = arith.addi %165, %166 : index
%c8_79 = arith.constant 8 : index
%c0_80 = arith.constant 0 : index
%c-1_81 = arith.constant -1 : index
%168 = arith.cmpi slt, %thread_id_x, %c0_80 : index
%169 = arith.subi %c-1_81, %thread_id_x : index
%170 = arith.select %168, %169, %thread_id_x : index
%171 = arith.divsi %170, %c8_79 : index
%172 = arith.subi %c-1_81, %171 : index
%173 = arith.select %168, %172, %171 : index
%174 = arith.addi %167, %173 : index
%c16_82 = arith.constant 16 : index
%175 = arith.addi %174, %c16_82 : index
vector.transfer_write %162, %3[%175, %45] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
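
ConvertAffineToStandard replaces every affine.apply with explicit arith ops. Because arith.divsi and arith.remsi truncate toward zero while affine floordiv/mod round toward negative infinity, each division expands into the cmpi/subi/select/divsi/subi/select pattern that now dominates the prologue (e.g. %9..%14 for thread_id_x floordiv 4, and %194..%203 for the mod 4 inside the loop). A small Python transcription (a sketch for intuition, not the pass itself) confirming the expansion computes floor division and a non-negative modulus:

# The "x floordiv c" / "x mod c" expansions emitted by lower-affine, transcribed
# to Python. Operand names in the comments refer to the sequences above (c = 4).

def trunc_rem(x: int, c: int) -> int:
    # Semantics of arith.remsi: remainder truncates toward zero.
    return -((-x) % c) if x < 0 else x % c

def floordiv_lowered(x: int, c: int) -> int:
    neg = x < 0                    # %9  = arith.cmpi slt, %thread_id_x, %c0
    x1 = (-1 - x) if neg else x    # %10 = arith.subi, %11 = arith.select
    q = x1 // c                    # %12 = arith.divsi (x1 >= 0, so trunc == floor)
    return (-1 - q) if neg else q  # %13 = arith.subi, %14 = arith.select

def mod_lowered(x: int, c: int) -> int:
    r = trunc_rem(x, c)            # %200 = arith.remsi
    return r + c if r < 0 else r   # %201..%203: cmpi/addi/select fix-up

for x in range(-64, 64):
    assert floordiv_lowered(x, 4) == x // 4   # Python // already floors
    assert mod_lowered(x, 4) == x % 4         # Python % is already non-negative for c > 0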
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.muli %thread_id_y, %c16 : index
%17 = arith.muli %thread_id_z, %c32 : index
%18 = arith.addi %16, %17 : index
%19 = arith.cmpi slt, %thread_id_x, %c0 : index
%20 = arith.subi %c-1, %thread_id_x : index
%21 = arith.select %19, %20, %thread_id_x : index
%22 = arith.divsi %21, %c4 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %19, %23, %22 : index
%25 = arith.addi %18, %24 : index
%26 = arith.muli %thread_id_x, %c4 : index
%27 = arith.cmpi slt, %thread_id_x, %c0 : index
%28 = arith.subi %c-1, %thread_id_x : index
%29 = arith.select %27, %28, %thread_id_x : index
%30 = arith.divsi %29, %c4 : index
%31 = arith.subi %c-1, %30 : index
%32 = arith.select %27, %31, %30 : index
%33 = arith.muli %32, %c-16 : index
%34 = arith.addi %26, %33 : index
%35 = arith.muli %thread_id_x, %c4 : index
%36 = arith.muli %workgroup_id_x, %c32 : index
%37 = arith.addi %35, %36 : index
%38 = arith.cmpi slt, %thread_id_x, %c0 : index
%39 = arith.subi %c-1, %thread_id_x : index
%40 = arith.select %38, %39, %thread_id_x : index
%41 = arith.divsi %40, %c8 : index
%42 = arith.subi %c-1, %41 : index
%43 = arith.select %38, %42, %41 : index
%44 = arith.muli %43, %c-32 : index
%45 = arith.addi %37, %44 : index
%46 = arith.muli %thread_id_y, %c8 : index
%47 = arith.muli %thread_id_z, %c16 : index
%48 = arith.addi %46, %47 : index
%49 = arith.cmpi slt, %thread_id_x, %c0 : index
%50 = arith.subi %c-1, %thread_id_x : index
%51 = arith.select %49, %50, %thread_id_x : index
%52 = arith.divsi %51, %c8 : index
%53 = arith.subi %c-1, %52 : index
%54 = arith.select %49, %53, %52 : index
%55 = arith.addi %48, %54 : index
%56 = arith.muli %thread_id_x, %c4 : index
%57 = arith.cmpi slt, %thread_id_x, %c0 : index
%58 = arith.subi %c-1, %thread_id_x : index
%59 = arith.select %57, %58, %thread_id_x : index
%60 = arith.divsi %59, %c8 : index
%61 = arith.subi %c-1, %60 : index
%62 = arith.select %57, %61, %60 : index
%63 = arith.muli %62, %c-32 : index
%64 = arith.addi %56, %63 : index
%65 = arith.muli %thread_id_y, %c16 : index
%66 = arith.cmpi slt, %thread_id_x, %c0 : index
%67 = arith.subi %c-1, %thread_id_x : index
%68 = arith.select %66, %67, %thread_id_x : index
%69 = arith.divsi %68, %c32 : index
%70 = arith.subi %c-1, %69 : index
%71 = arith.select %66, %70, %69 : index
%72 = arith.muli %71, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%73 = nvgpu.device_async_copy %1[%15, %34], %alloc_0[%c0, %25, %34], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%74 = nvgpu.device_async_copy %2[%55, %45], %alloc_1[%c0, %55, %64], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%75 = nvgpu.device_async_create_group %73, %74 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%76 = arith.muli %thread_id_x, %c4 : index
%77 = arith.cmpi slt, %thread_id_x, %c0 : index
%78 = arith.subi %c-1, %thread_id_x : index
%79 = arith.select %77, %78, %thread_id_x : index
%80 = arith.divsi %79, %c4 : index
%81 = arith.subi %c-1, %80 : index
%82 = arith.select %77, %81, %80 : index
%83 = arith.muli %82, %c-16 : index
%84 = arith.addi %76, %83 : index
%85 = arith.addi %84, %c16 : index
%86 = nvgpu.device_async_copy %1[%15, %85], %alloc_0[%c1, %25, %34], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.muli %thread_id_y, %c8 : index
%88 = arith.muli %thread_id_z, %c16 : index
%89 = arith.addi %87, %88 : index
%90 = arith.cmpi slt, %thread_id_x, %c0 : index
%91 = arith.subi %c-1, %thread_id_x : index
%92 = arith.select %90, %91, %thread_id_x : index
%93 = arith.divsi %92, %c8 : index
%94 = arith.subi %c-1, %93 : index
%95 = arith.select %90, %94, %93 : index
%96 = arith.addi %89, %95 : index
%97 = arith.addi %96, %c16 : index
%98 = nvgpu.device_async_copy %2[%97, %45], %alloc_1[%c1, %55, %64], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%99 = nvgpu.device_async_create_group %86, %98 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%100 = arith.muli %thread_id_x, %c4 : index
%101 = arith.cmpi slt, %thread_id_x, %c0 : index
%102 = arith.subi %c-1, %thread_id_x : index
%103 = arith.select %101, %102, %thread_id_x : index
%104 = arith.divsi %103, %c4 : index
%105 = arith.subi %c-1, %104 : index
%106 = arith.select %101, %105, %104 : index
%107 = arith.muli %106, %c-16 : index
%108 = arith.addi %100, %107 : index
%109 = arith.addi %108, %c32 : index
%110 = nvgpu.device_async_copy %1[%15, %109], %alloc_0[%c2, %25, %34], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%111 = arith.muli %thread_id_y, %c8 : index
%112 = arith.muli %thread_id_z, %c16 : index
%113 = arith.addi %111, %112 : index
%114 = arith.cmpi slt, %thread_id_x, %c0 : index
%115 = arith.subi %c-1, %thread_id_x : index
%116 = arith.select %114, %115, %thread_id_x : index
%117 = arith.divsi %116, %c8 : index
%118 = arith.subi %c-1, %117 : index
%119 = arith.select %114, %118, %117 : index
%120 = arith.addi %113, %119 : index
%121 = arith.addi %120, %c32 : index
%122 = nvgpu.device_async_copy %2[%121, %45], %alloc_1[%c2, %55, %64], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%123 = nvgpu.device_async_create_group %110, %122 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%124 = arith.muli %thread_id_x, %c4 : index
%125 = arith.cmpi slt, %thread_id_x, %c0 : index
%126 = arith.subi %c-1, %thread_id_x : index
%127 = arith.select %125, %126, %thread_id_x : index
%128 = arith.divsi %127, %c4 : index
%129 = arith.subi %c-1, %128 : index
%130 = arith.select %125, %129, %128 : index
%131 = arith.muli %130, %c-16 : index
%132 = arith.addi %124, %131 : index
%133 = arith.addi %132, %c48 : index
%134 = nvgpu.device_async_copy %1[%15, %133], %alloc_0[%c3, %25, %34], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%135 = arith.muli %thread_id_y, %c8 : index
%136 = arith.muli %thread_id_z, %c16 : index
%137 = arith.addi %135, %136 : index
%138 = arith.cmpi slt, %thread_id_x, %c0 : index
%139 = arith.subi %c-1, %thread_id_x : index
%140 = arith.select %138, %139, %thread_id_x : index
%141 = arith.divsi %140, %c8 : index
%142 = arith.subi %c-1, %141 : index
%143 = arith.select %138, %142, %141 : index
%144 = arith.addi %137, %143 : index
%145 = arith.addi %144, %c48 : index
%146 = nvgpu.device_async_copy %2[%145, %45], %alloc_1[%c3, %55, %64], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%147 = nvgpu.device_async_create_group %134, %146 {__pipelining_first_stage__}
%148:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %75, %arg3 = %99, %arg4 = %123, %arg5 = %147, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%176 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%177 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %65, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%178 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %65, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%179 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %72] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%180 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %72] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%181 = gpu.subgroup_mma_compute %177, %179, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%182 = gpu.subgroup_mma_compute %178, %180, %181 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%183 = arith.addi %arg0, %c64 : index
%184 = arith.muli %thread_id_x, %c4 : index
%185 = arith.addi %183, %184 : index
%186 = arith.cmpi slt, %thread_id_x, %c0 : index
%187 = arith.subi %c-1, %thread_id_x : index
%188 = arith.select %186, %187, %thread_id_x : index
%189 = arith.divsi %188, %c4 : index
%190 = arith.subi %c-1, %189 : index
%191 = arith.select %186, %190, %189 : index
%192 = arith.muli %191, %c-16 : index
%193 = arith.addi %185, %192 : index
%194 = arith.cmpi slt, %183, %c0 : index
%195 = arith.subi %c-65, %arg0 : index
%196 = arith.select %194, %195, %183 : index
%197 = arith.divsi %196, %c16 : index
%198 = arith.subi %c-1, %197 : index
%199 = arith.select %194, %198, %197 : index
%200 = arith.remsi %199, %c4 : index
%201 = arith.cmpi slt, %200, %c0 : index
%202 = arith.addi %200, %c4 : index
%203 = arith.select %201, %202, %200 : index
%204 = arith.select %176, %c4, %c0 : index
%205 = nvgpu.device_async_copy %1[%15, %193], %alloc_0[%203, %25, %34], 4, %204 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%206 = arith.muli %thread_id_y, %c8 : index
%207 = arith.addi %183, %206 : index
%208 = arith.muli %thread_id_z, %c16 : index
%209 = arith.addi %207, %208 : index
%210 = arith.cmpi slt, %thread_id_x, %c0 : index
%211 = arith.subi %c-1, %thread_id_x : index
%212 = arith.select %210, %211, %thread_id_x : index
%213 = arith.divsi %212, %c8 : index
%214 = arith.subi %c-1, %213 : index
%215 = arith.select %210, %214, %213 : index
%216 = arith.addi %209, %215 : index
%217 = nvgpu.device_async_copy %2[%216, %45], %alloc_1[%203, %55, %64], 4, %204 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%218 = nvgpu.device_async_create_group %205, %217 {__pipelining_first_stage__}
scf.yield %182, %arg3, %arg4, %arg5, %218, %arg7, %arg8, %arg9, %203 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %148#0, %alloc[%65, %72] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%149 = vector.transfer_read %alloc[%55, %64], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%150 = arith.muli %thread_id_y, %c8 : index
%151 = arith.muli %thread_id_z, %c16 : index
%152 = arith.addi %150, %151 : index
%153 = arith.muli %workgroup_id_y, %c32 : index
%154 = arith.addi %152, %153 : index
%155 = arith.cmpi slt, %thread_id_x, %c0 : index
%156 = arith.subi %c-1, %thread_id_x : index
%157 = arith.select %155, %156, %thread_id_x : index
%158 = arith.divsi %157, %c8 : index
%159 = arith.subi %c-1, %158 : index
%160 = arith.select %155, %159, %158 : index
%161 = arith.addi %154, %160 : index
vector.transfer_write %149, %3[%161, %45] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%162 = vector.transfer_read %alloc[%97, %64], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%163 = arith.muli %thread_id_y, %c8 : index
%164 = arith.muli %thread_id_z, %c16 : index
%165 = arith.addi %163, %164 : index
%166 = arith.muli %workgroup_id_y, %c32 : index
%167 = arith.addi %165, %166 : index
%168 = arith.cmpi slt, %thread_id_x, %c0 : index
%169 = arith.subi %c-1, %thread_id_x : index
%170 = arith.select %168, %169, %thread_id_x : index
%171 = arith.divsi %170, %c8 : index
%172 = arith.subi %c-1, %171 : index
%173 = arith.select %168, %172, %171 : index
%174 = arith.addi %167, %173 : index
%175 = arith.addi %174, %c16 : index
vector.transfer_write %162, %3[%175, %45] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
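
The canonicalizer mostly de-duplicates constants (the %c16_2, %c32_3, ... copies collapse into the single %c16, %c32, ... at the top) and applies small algebraic folds; the only new constant, %c-65, comes from folding the negative branch of the in-loop floordiv, since %183 = %arg0 + 64 makes -1 - %183 the same value as -65 - %arg0. A one-line check of that fold over the actual loop bounds (illustrative only):

# %195 before:  arith.subi %c-1, %183    with %183 = arith.addi %arg0, %c64
# %195 after:   arith.subi %c-65, %arg0
for arg0 in range(0, 1024, 16):            # scf.for bounds
    assert -1 - (arg0 + 64) == -65 - arg0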
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.transfer_read %alloc[%30, %31], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.transfer_write %55, %3[%57, %26] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%58 = vector.transfer_read %alloc[%41, %31], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.transfer_write %58, %3[%59, %26] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
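
CSE then removes the structurally identical index computations that canonicalization exposed: the repeated thread_id_y * 16 products and the thread_id_x floordiv 4/8/32 chains are each computed once (%4, %14, %24, %34, ...) and reused, shrinking the prologue from roughly 70 index ops to around 30. A toy value-numbering pass (a sketch of the idea only, not the MLIR CSE implementation) showing why the duplicates merge:

# Toy value numbering: pure ops with the same opcode and (renamed) operands
# map to a single SSA value; later duplicates are replaced by the first result.

def cse(ops):
    """ops: list of (result, opcode, operands) in SSA order; all ops assumed pure."""
    seen, renamed, kept = {}, {}, []
    for res, opcode, operands in ops:
        key = (opcode, tuple(renamed.get(o, o) for o in operands))
        if key in seen:
            renamed[res] = seen[key]        # duplicate: reuse the earlier value
        else:
            seen[key] = res
            kept.append((res, opcode, key[1]))
    return kept, renamed

# Mirrors %4/%5/%6 versus the duplicated %16/%17/%18 in the pre-CSE prologue:
ops = [("%4",  "muli", ("tid_y", "c16")),
       ("%5",  "muli", ("tid_z", "c32")),
       ("%6",  "addi", ("%4", "%5")),
       ("%16", "muli", ("tid_y", "c16")),
       ("%17", "muli", ("tid_z", "c32")),
       ("%18", "addi", ("%16", "%17"))]
kept, renamed = cse(ops)
assert renamed == {"%16": "%4", "%17": "%5", "%18": "%6"} and len(kept) == 3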
// -----// IR Dump After OneShotBufferize (one-shot-bufferize) //----- //
module {
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.transfer_read %alloc[%30, %31], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.transfer_write %55, %3[%57, %26] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%58 = vector.transfer_read %alloc[%41, %31], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.transfer_write %58, %3[%59, %26] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
}
// -----// IR Dump After FoldTensorExtractOpPass (iree-codegen-fold-tensor-extract-op) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.transfer_read %alloc[%30, %31], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.transfer_write %55, %3[%57, %26] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%58 = vector.transfer_read %alloc[%41, %31], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.transfer_write %58, %3[%59, %26] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
// -----// IR Dump After LLVMGPUVectorLoweringPass (iree-llvmgpu-vector-lowering) //----- //
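// [annotation] Relative to the preceding dump, the visible change in this dispatch is confined to the
// epilogue copy-out: the vector.transfer_read / vector.transfer_write pair on %alloc and %3 is lowered
// to plain vector.load / vector.store; the pipelined scf.for matmul loop is unchanged.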
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After ExpandGPUOpsPass (iree-codegen-expand-gpu-ops) //----- //
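// [annotation] This dump appears identical to the previous one; the pass makes no visible change to this dispatch.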
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After ExtractAddressComputationGPUPass (extract-address-computation-gpu) //----- //
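// [annotation] No visible change in this dispatch; the dump matches the previous one.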
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After ExpandOps (memref-expand) //----- //
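// [annotation] Again no visible change to this dispatch.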
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
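// [annotation] No visible change; this function has no memref.subview / expand_shape / collapse_shape ops
// left for the pass to fold into its loads and stores.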
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After ExpandStridedMetadata (expand-strided-metadata) //----- //
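// [annotation] No visible change to this dispatch; the dump is unchanged from the previous pass.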
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
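// The function above is a four-stage software-pipelined matmul: the prologue issues four
// nvgpu.device_async_copy groups into the rotating shared buffers %alloc_0/%alloc_1, and each
// scf.for iteration waits until at most three groups remain in flight ({numGroups = 3}) before
// the subgroup_mma work, then refills the stage it just consumed with the tile four iterations
// ahead. The sketch below is a rough CUDA analogue of that schedule, written with the cp.async
// pipeline primitives these nvgpu ops target; the kernel name, launch shape, and tile indexing
// are illustrative only and not taken from this dump.

#include <cuda_pipeline_primitives.h>

__global__ void pipelined_copy_schedule(const float4 *a_tiles, const float4 *b_tiles,
                                        int num_k_tiles) {
  __shared__ float4 smem_a[4][32 * 5];  // 4 rotating stages, padded like memref<4x32x20xf32>
  __shared__ float4 smem_b[4][16 * 9];  // 4 rotating stages, padded like memref<4x16x36xf32>
  int lane = threadIdx.x;               // schematic indexing; assumes blockDim.x <= 128

  // Prologue: fill stages 0..3 with the first four K tiles, committing one async group per
  // stage (the four device_async_create_group ops before the scf.for).
  for (int s = 0; s < 4; ++s) {
    __pipeline_memcpy_async(&smem_a[s][lane], &a_tiles[s * blockDim.x + lane], sizeof(float4));
    __pipeline_memcpy_async(&smem_b[s][lane], &b_tiles[s * blockDim.x + lane], sizeof(float4));
    __pipeline_commit();
  }

  for (int k = 0; k < num_k_tiles; ++k) {
    int stage = k & 3;          // same rotation the loop carries in %arg6..%arg9
    __pipeline_wait_prior(3);   // ~ nvgpu.device_async_wait %arg2 {numGroups = 3}
    __syncthreads();

    // ... consume smem_a[stage] / smem_b[stage] here (the subgroup_mma part of the body) ...

    __syncthreads();
    // Refill the stage just consumed with the tile four iterations ahead. The dump keeps the
    // copy unconditional but selects an element count of 4 or 0 (%80) near the end of the
    // loop; a plain guard expresses the same predication here.
    if (k + 4 < num_k_tiles) {
      int next = k + 4;
      __pipeline_memcpy_async(&smem_a[stage][lane], &a_tiles[next * blockDim.x + lane],
                              sizeof(float4));
      __pipeline_memcpy_async(&smem_b[stage][lane], &b_tiles[next * blockDim.x + lane],
                              sizeof(float4));
    }
    __pipeline_commit();  // a group is still committed, so the wait count stays consistent
  }
}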
// -----// IR Dump After IREELoopInvariantCodeMotionPass (iree-loop-invariant-code-motion) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After DecomposeAffineOpsPass (iree-codegen-decompose-affine-ops) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
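// Inside the loop body, two 16x8 "AOp" fragments and two 8x16 "BOp" fragments are loaded from
// the padded shared tiles (leadDimension 20 and 36 are the padded row strides) and accumulated
// into one 16x16 "COp" fragment by the chained subgroup_mma_compute ops. At warp level this is
// the familiar wmma load/compute/store pattern. The sketch below shows that pattern with the
// common f16 16x16x16 wmma configuration for brevity; the dump itself uses f32 operands with
// two 16x16x8 steps per 16-wide K tile, and the function name here is illustrative.

#include <mma.h>
using namespace nvcuda;

// One K-tile step of the warp-level accumulation, analogous to a
// subgroup_mma_load_matrix / subgroup_mma_load_matrix / subgroup_mma_compute triple above.
__device__ void mma_tile_step(const half *smem_a, const half *smem_b, int lda, int ldb,
                              wmma::fragment<wmma::accumulator, 16, 16, 16, float> &acc) {
  wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag;
  wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
  wmma::load_matrix_sync(a_frag, smem_a, lda);  // ~ gpu.subgroup_mma_load_matrix ... "AOp"
  wmma::load_matrix_sync(b_frag, smem_b, ldb);  // ~ gpu.subgroup_mma_load_matrix ... "BOp"
  wmma::mma_sync(acc, a_frag, b_frag, acc);     // ~ gpu.subgroup_mma_compute
}

// After the K loop the accumulator goes back to shared memory the way
// gpu.subgroup_mma_store_matrix does:
//   wmma::store_matrix_sync(smem_c, acc, ldc, wmma::mem_row_major);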
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After IREELoopInvariantCodeMotionPass (iree-loop-invariant-code-motion) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After ConvertAffineToStandard (lower-affine) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
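// The long arith.cmpi / arith.subi / arith.select / arith.divsi / arith.remsi chains feeding the
// copy and load indices are the usual signed floor-division and positive-modulo expansions that
// affine index expressions lower to: divsi truncates toward zero, so a negative dividend is
// reflected through (-1 - x) before the divide and the quotient is reflected back, and a remsi
// result is shifted into [0, divisor). For example, %70..%79 compute ((%arg0 + 64) floordiv 16)
// mod 4, with %c-65 - %arg0 being the folded form of -1 - (%arg0 + 64). A small CUDA/C sketch of
// the same expansion (function names are illustrative):

// Floor division as expanded above: reflect, truncating divide (arith.divsi), reflect back.
__host__ __device__ int floordiv(int x, int d) {  // d > 0, as in the dump (4, 8, 16, 32)
  bool neg = x < 0;
  int reflected = neg ? (-1 - x) : x;
  int q = reflected / d;                          // arith.divsi truncates toward zero
  return neg ? (-1 - q) : q;
}

// Positive modulo as expanded above (the remsi / cmpi slt / addi / select chain).
__host__ __device__ int floormod(int x, int d) {
  int r = x % d;                                  // arith.remsi: sign follows the dividend
  return (r < 0) ? (r + d) : r;
}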
// -----// IR Dump After GPUCheckResourceUsagePass (iree-codegen-gpu-check-resource-usage) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
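// NOTE (annotation, not part of the pass output): SCFToControlFlow lowers the
// scf.for into explicit CFG blocks: ^bb1 carries the induction variable and
// the former iter_args as block arguments, ^bb2 is the loop body, and ^bb3 is
// the epilogue that stores the accumulator tile through workgroup memory
// (%alloc) and writes it back to the global output buffer. The recurring
// cmpi/subi/select/divsi/subi/select sequence (e.g. %9 through %14) is the
// standard signed floor-division expansion; here it computes
// thread_id_x floordiv 4, which equals plain thread_id_x / 4 for the
// non-negative thread ids that actually occur.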
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
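// NOTE (annotation, not part of the pass output): the Canonicalizer dump
// above is identical to the SCFToControlFlow output, and the CSE,
// ConvertComplexToStandard, bf16-conversion, and polynomial-approximation
// dumps that follow reprint the same function unchanged; this kernel contains
// no complex, bf16, or math-dialect ops for those passes to rewrite.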
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After ConvertComplexToStandard (convert-complex-to-standard) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After ConvertBf16ArithToF32Pass (iree-convert-bf16-arith-to-f32) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After ConvertBf16ToUInt16BuffersPass (iree-codegen-convert-bf16-to-uint16-buffers) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After PolynomialApproximationPass (iree-codegen-polynomial-approximation) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
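Note on the dumps above and below: from PolynomialApproximationPass onward, several passes (memref-expand, fold-memref-alias-ops, expand-strided-metadata, and so on) leave this dispatch untouched, so the same function body repeats verbatim. What the IR implements is a software-pipelined 1024x1024xf32 matmul: a 4-deep circular buffer of shared-memory tiles (%alloc_0, %alloc_1) filled with nvgpu.device_async_copy, a prologue that issues four copy groups before branching to ^bb1, and a steady-state loop (^bb2) that waits until only three groups remain in flight, runs two 16x8 by 8x16 subgroup_mma_compute steps, and issues the copy for the K-tile four stages ahead. The Python below is a minimal sketch of that schedule under those assumptions, not IREE code; the helper names (issue_copy, wait_and_compute) and the prints are purely illustrative.

# Minimal sketch of the 4-stage async-copy pipeline schedule in the IR above.
# Assumptions: 4 shared-memory slots (the leading dim of %alloc_0/%alloc_1),
# K = 1024, K tile = 16; helper names are invented for illustration.
STAGES = 4
K, K_TILE = 1024, 16

def issue_copy(k, slot):
    # stands in for the pair of nvgpu.device_async_copy ops plus
    # nvgpu.device_async_create_group that stage the K-tile starting at k
    print(f"copy      K={k:4d} -> shared slot {slot}")

def wait_and_compute(k, slot):
    # stands in for nvgpu.device_async_wait {numGroups = 3} followed by the
    # two gpu.subgroup_mma_compute ops reading shared slot `slot`
    print(f"wait+mma  K={k:4d} from shared slot {slot}")

# Prologue: the four copy groups issued before cf.br ^bb1 (%38, %43, %48, %53).
for s in range(STAGES):
    issue_copy(s * K_TILE, s)

# Steady state: block ^bb2, one iteration per K-tile.
for k in range(0, K, K_TILE):
    slot = (k // K_TILE) % STAGES
    wait_and_compute(k, slot)
    next_k = k + STAGES * K_TILE          # k + 64 in the IR
    if next_k < K:                        # mirrors %65 = arith.cmpi slt, %54, %c960
        issue_copy(next_k, (next_k // K_TILE) % STAGES)
    # in the IR the copy is always issued, but with 0 elements once k >= 960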
// -----// IR Dump After ExpandOps (memref-expand) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
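The longest stretch of arithmetic in the loop body (%72 through %84 in the dumps above) is just the lowered form of the circular-buffer slot index: add 64 to the loop counter, floor-divide by 16 (the cmpi/subi/select pairs handle negative operands, which cannot occur here), and take a non-negative remainder mod 4. A small sanity check, assuming k stands for the non-negative multiple of 16 carried in %54; the function name below is invented:

def copy_slot(k):
    # what %72..%84 compute: the destination slot of the in-loop
    # nvgpu.device_async_copy
    return ((k + 64) // 16) % 4

for k in range(0, 1024, 16):
    # the copy writes into the slot whose data this iteration just consumed
    assert copy_slot(k) == (k // 16) % 4
print("slot sequence:", [copy_slot(k) for k in range(0, 128, 16)])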
// -----// IR Dump After ExpandStridedMetadata (expand-strided-metadata) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After EmulateNarrowTypePass (iree-codegen-emulate-narrow-type) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
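The shared-memory tiles are padded along their innermost dimension: the A staging buffer is 4x32x20xf32 for 32x16 tiles (leadDimension = 20), and the B and C buffers use 36 where 32 would suffice. The IR does not state why, but this is the standard padding trick for staggering shared-memory bank assignments, and a toy model makes the effect visible. The sketch below assumes 32 four-byte banks and a simple pattern where 32 threads each read one element of the same column; the real wmma access pattern is more involved, so treat the numbers as illustrative only.

BANKS = 32  # 4-byte shared-memory banks assumed for this toy model

def distinct_banks(lead_dim, threads=32):
    # bank touched by thread t reading element (t, 0) of a row-major tile
    return len({(t * lead_dim) % BANKS for t in range(threads)})

for ld in (16, 20, 32, 36):
    print(f"leadDimension {ld:2d} -> {distinct_banks(ld):2d} distinct banks")
# the padded leading dimensions (20, 36) spread the column accesses over
# more banks than the unpadded ones (16, 32), which is the usual motivation
# for this kind of padding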
// -----// IR Dump After AffineExpandIndexOps (affine-expand-index-ops) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
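// Note on the control flow above: this is the software-pipelined matmul
// mainloop. ^bb0 issues four nvgpu.device_async_copy groups as a prologue,
// one per slot of the 4-deep workgroup buffers (k offsets +0, +16, +32, +48).
// ^bb1/^bb2 form the steady-state loop: it waits until the oldest in-flight
// copy group has completed ({numGroups = 3} leaves at most three pending),
// feeds the WMMA ops from shared memory, prefetches the tile at k+64 into
// slot ((k+64) floordiv 16) mod 4, and predicates that copy to zero elements
// once k >= 960 so the tail iterations do not read out of bounds. ^bb3 is the
// epilogue: it stages the accumulator through the 32x36 buffer and writes it
// out with two vector.store ops. A minimal sketch of the schedule, with
// illustrative names only (%acc, %tok, ^loop are not part of the dump):
//
//   ^loop(%k: index, %acc: !gpu.mma_matrix<16x16xf32, "COp">, %tok: !nvgpu.device.async.token, ...):
//     nvgpu.device_async_wait %tok {numGroups = 3 : i32}   // copies for this k-tile are ready
//     ... gpu.subgroup_mma_load_matrix + gpu.subgroup_mma_compute ...
//     %tok_next = nvgpu.device_async_create_group ...      // prefetch the k+64 tile
//     cf.br ^loop(%k_plus_16, %acc_next, ..., %tok_next, ...)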
// -----// IR Dump After ConvertAffineToStandard (lower-affine) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
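// Note on lower-affine: any remaining affine ops are expanded into plain
// arith/cf here. The cmpi/subi/select/divsi/select chains throughout the
// function (e.g. %9-%14) are the standard expansion of an affine `floordiv`,
// which rounds toward negative infinity rather than toward zero; for the
// non-negative thread ids used here the two selects are no-ops at run time.
// Roughly (illustrative sketch, not part of the dump), a map such as
//
//   %q = affine.apply affine_map<(d0) -> (d0 floordiv 4)>(%thread_id_x)
//
// expands to:
//
//   %neg = arith.cmpi slt, %thread_id_x, %c0 : index
//   %abs = arith.select %neg, %m1, %thread_id_x : index   // %m1 = -1 - %thread_id_x
//   %div = arith.divsi %abs, %c4 : index
//   %q   = arith.select %neg, %m2, %div : index           // %m2 = -1 - %div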
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
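// Note: the canonicalize and cse dumps above are line-for-line identical to
// the lower-affine output; neither pass finds anything left to fold or
// deduplicate in this function at this point.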
// -----// IR Dump After StripDebugInfo (strip-debuginfo) //----- //
module {
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
}
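// Note: strip-debuginfo removes source location attributes. Locations are not
// printed in these dumps, so the only visible difference is that the function
// is now shown inside its enclosing module.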
// -----// IR Dump After LLVMGPUCastAddressSpaceFunctionPass (iree-llvmgpu-cast-address-space-function) //----- //
module {
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
}
// -----// IR Dump After DropCompilerHints (iree-util-drop-compiler-hints) //----- //
module {
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
}
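The loop above (^bb1/^bb2) is the software-pipelined k-loop: four shared-memory buffers are prefilled before the branch into ^bb1, each iteration waits until the copy group for its own k-tile has landed (nvgpu.device_async_wait ... {numGroups = 3}), runs the two subgroup_mma_compute steps, and then issues the nvgpu.device_async_copy for the tile four iterations ahead into the buffer it just consumed. A rough CUDA analogue of that schedule, written with the cp.async pipeline primitives rather than IREE's generated code, is sketched below; pipelined_loop, compute_on_tile, STAGES, and TILE_FLOATS are illustrative names and sizes chosen for the sketch, not values taken from this dispatch, and the sketch assumes a single-warp block with k_tiles >= STAGES.

// Minimal sketch of a 4-stage cp.async software pipeline (sm_80+, one warp
// per block, gmem holds at least k_tiles * TILE_FLOATS floats). This is an
// illustration of the schedule, not the code IREE emits.
#include <cuda_pipeline.h>

constexpr int STAGES = 4;        // four shared-memory buffers, as in the IR above
constexpr int TILE_FLOATS = 128; // one warp copies 4 floats per thread per tile

__device__ float sink;           // keeps the stand-in compute from being optimized away

__device__ void compute_on_tile(const float *tile) {
  // Stand-in for the two subgroup_mma_compute steps.
  float acc = 0.0f;
  for (int i = 0; i < TILE_FLOATS; ++i) acc += tile[i];
  atomicAdd(&sink, acc);
}

__global__ void pipelined_loop(const float *__restrict__ gmem, int k_tiles) {
  __shared__ float smem[STAGES][TILE_FLOATS];
  const int lane = threadIdx.x; // 0..31

  // Prologue: prefetch the first STAGES tiles, one commit group per tile
  // (the four device_async_create_group ops issued before branching to ^bb1).
  for (int s = 0; s < STAGES; ++s) {
    __pipeline_memcpy_async(&smem[s][4 * lane],
                            &gmem[s * TILE_FLOATS + 4 * lane],
                            sizeof(float) * 4); // 16-byte copy, the ", 4" in the IR
    __pipeline_commit();
  }

  for (int k = 0; k < k_tiles; ++k) {
    // Wait until at most STAGES-1 groups are still in flight, i.e. the tile
    // for this iteration has landed (device_async_wait {numGroups = 3}).
    __pipeline_wait_prior(STAGES - 1);
    __syncthreads();
    compute_on_tile(smem[k % STAGES]);
    __syncthreads();

    // Prefetch the tile STAGES iterations ahead into the buffer just freed;
    // the real code predicates this copy off near the end of the loop but
    // still commits a group, so the group count stays consistent.
    const int next = k + STAGES;
    if (next < k_tiles) {
      __pipeline_memcpy_async(&smem[next % STAGES][4 * lane],
                              &gmem[next * TILE_FLOATS + 4 * lane],
                              sizeof(float) * 4);
    }
    __pipeline_commit();
  }
}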
// -----// IR Dump After ConvertToNVVMPass (iree-convert-to-nvvm) //----- //
module {
llvm.mlir.global external @__dynamic_shared_memory__() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8>
llvm.mlir.global private @__shared_memory___1() {addr_space = 3 : i32, alignment = 4 : i64} : !llvm.array<32 x array<36 x f32>>
llvm.mlir.global private @__shared_memory___0() {addr_space = 3 : i32, alignment = 4 : i64} : !llvm.array<4 x array<32 x array<20 x f32>>>
llvm.mlir.global private @__shared_memory__() {addr_space = 3 : i32, alignment = 4 : i64} : !llvm.array<4 x array<16 x array<36 x f32>>>
llvm.func @dot_dispatch_0(%arg0: !llvm.ptr<1> {llvm.align = 16 : i32, llvm.noalias}, %arg1: !llvm.ptr<1> {llvm.align = 16 : i32, llvm.noalias}, %arg2: !llvm.ptr<1> {llvm.align = 16 : i32, llvm.noalias}) {
%0 = llvm.mlir.constant(16 : i32) : i32
%1 = llvm.mlir.constant(32 : i32) : i32
%2 = llvm.mlir.constant(3 : i32) : i32
%3 = llvm.mlir.constant(36 : index) : i32
%4 = llvm.mlir.constant(20 : index) : i32
%5 = llvm.mlir.undef : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%6 = llvm.mlir.addressof @__dynamic_shared_memory__ : !llvm.ptr<3>
%7 = llvm.mlir.constant(0 : i64) : i64
%8 = llvm.mlir.constant(0 : i64) : i64
%9 = llvm.getelementptr %6[%7, %8] : (!llvm.ptr<3>, i64, i64) -> !llvm.ptr<3>, !llvm.array<0 x i8>
%10 = llvm.mlir.constant(576 : index) : i64
%11 = llvm.mlir.addressof @__dynamic_shared_memory__ : !llvm.ptr<3>
%12 = llvm.mlir.constant(0 : i64) : i64
%13 = llvm.mlir.constant(9216 : i64) : i64
%14 = llvm.getelementptr %11[%12, %13] : (!llvm.ptr<3>, i64, i64) -> !llvm.ptr<3>, !llvm.array<0 x i8>
%15 = llvm.mlir.constant(640 : index) : i64
%16 = llvm.mlir.constant(20 : index) : i64
%17 = llvm.mlir.addressof @__dynamic_shared_memory__ : !llvm.ptr<3>
%18 = llvm.mlir.constant(0 : i64) : i64
%19 = llvm.mlir.constant(19456 : i64) : i64
%20 = llvm.getelementptr %17[%18, %19] : (!llvm.ptr<3>, i64, i64) -> !llvm.ptr<3>, !llvm.array<0 x i8>
%21 = llvm.mlir.constant(0.000000e+00 : f32) : f32
%22 = llvm.mlir.constant(0 : index) : i64
%23 = llvm.mlir.constant(1024 : index) : i64
%24 = llvm.mlir.constant(16 : index) : i64
%25 = llvm.mlir.constant(8 : index) : i64
%26 = llvm.mlir.constant(4 : index) : i64
%27 = llvm.mlir.constant(1 : index) : i64
%28 = llvm.mlir.constant(2 : index) : i64
%29 = llvm.mlir.constant(3 : index) : i64
%30 = llvm.mlir.constant(960 : index) : i64
%31 = llvm.mlir.constant(64 : index) : i64
%32 = llvm.mlir.constant(32 : index) : i64
%33 = llvm.mlir.constant(-1 : index) : i64
%34 = llvm.mlir.constant(-16 : index) : i64
%35 = llvm.mlir.constant(-32 : index) : i64
%36 = llvm.mlir.constant(48 : index) : i64
%37 = llvm.mlir.constant(-65 : index) : i64
%38 = llvm.mlir.constant(36 : index) : i64
%39 = llvm.getelementptr %20[0, 0, 0] : (!llvm.ptr<3>) -> !llvm.ptr<3>, !llvm.array<32 x array<36 x f32>>
%40 = llvm.getelementptr %14[0, 0, 0, 0] : (!llvm.ptr<3>) -> !llvm.ptr<3>, !llvm.array<4 x array<32 x array<20 x f32>>>
%41 = llvm.getelementptr %9[0, 0, 0, 0] : (!llvm.ptr<3>) -> !llvm.ptr<3>, !llvm.array<4 x array<16 x array<36 x f32>>>
%42 = llvm.insertvalue %21, %5[0] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%43 = llvm.insertvalue %21, %42[1] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%44 = llvm.insertvalue %21, %43[2] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%45 = llvm.insertvalue %21, %44[3] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%46 = llvm.insertvalue %21, %45[4] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%47 = llvm.insertvalue %21, %46[5] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%48 = llvm.insertvalue %21, %47[6] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%49 = llvm.insertvalue %21, %48[7] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%50 = nvvm.read.ptx.sreg.tid.x : i32
%51 = llvm.sext %50 : i32 to i64
%52 = nvvm.read.ptx.sreg.tid.y : i32
%53 = llvm.sext %52 : i32 to i64
%54 = nvvm.read.ptx.sreg.tid.z : i32
%55 = llvm.sext %54 : i32 to i64
%56 = llvm.ptrtoint %arg0 : !llvm.ptr<1> to i64
%57 = llvm.and %56, %22 : i64
%58 = llvm.icmp "eq" %57, %22 : i64
llvm.intr.assume %58 : i1
%59 = llvm.ptrtoint %arg1 : !llvm.ptr<1> to i64
%60 = llvm.and %59, %22 : i64
%61 = llvm.icmp "eq" %60, %22 : i64
llvm.intr.assume %61 : i1
%62 = llvm.ptrtoint %arg2 : !llvm.ptr<1> to i64
%63 = llvm.and %62, %22 : i64
%64 = llvm.icmp "eq" %63, %22 : i64
llvm.intr.assume %64 : i1
%65 = nvvm.read.ptx.sreg.ctaid.x : i32
%66 = llvm.sext %65 : i32 to i64
%67 = nvvm.read.ptx.sreg.ctaid.y : i32
%68 = llvm.sext %67 : i32 to i64
%69 = llvm.mul %53, %24 : i64
%70 = llvm.mul %55, %32 : i64
%71 = llvm.add %69, %70 : i64
%72 = llvm.mul %68, %32 : i64
%73 = llvm.add %71, %72 : i64
%74 = llvm.icmp "slt" %51, %22 : i64
%75 = llvm.sub %33, %51 : i64
%76 = llvm.select %74, %75, %51 : i1, i64
%77 = llvm.sdiv %76, %26 : i64
%78 = llvm.sub %33, %77 : i64
%79 = llvm.select %74, %78, %77 : i1, i64
%80 = llvm.add %73, %79 : i64
%81 = llvm.add %71, %79 : i64
%82 = llvm.mul %51, %26 : i64
%83 = llvm.mul %79, %34 : i64
%84 = llvm.add %82, %83 : i64
%85 = llvm.mul %66, %32 : i64
%86 = llvm.add %82, %85 : i64
%87 = llvm.sdiv %76, %25 : i64
%88 = llvm.sub %33, %87 : i64
%89 = llvm.select %74, %88, %87 : i1, i64
%90 = llvm.mul %89, %35 : i64
%91 = llvm.add %86, %90 : i64
%92 = llvm.mul %53, %25 : i64
%93 = llvm.mul %55, %24 : i64
%94 = llvm.add %92, %93 : i64
%95 = llvm.add %94, %89 : i64
%96 = llvm.add %82, %90 : i64
%97 = llvm.sdiv %76, %32 : i64
%98 = llvm.sub %33, %97 : i64
%99 = llvm.select %74, %98, %97 : i1, i64
%100 = llvm.mul %99, %24 : i64
nvvm.barrier0
%101 = llvm.mul %22, %15 : i64
%102 = llvm.mul %81, %16 : i64
%103 = llvm.add %101, %102 : i64
%104 = llvm.add %103, %84 : i64
%105 = llvm.getelementptr %40[%104] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%106 = llvm.mul %80, %23 : i64
%107 = llvm.add %106, %84 : i64
%108 = llvm.getelementptr %arg0[%107] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
nvvm.cp.async.shared.global %105, %108, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1>
%109 = llvm.mul %22, %10 : i64
%110 = llvm.mul %95, %38 : i64
%111 = llvm.add %109, %110 : i64
%112 = llvm.add %111, %96 : i64
%113 = llvm.getelementptr %41[%112] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%114 = llvm.mul %95, %23 : i64
%115 = llvm.add %114, %91 : i64
%116 = llvm.getelementptr %arg1[%115] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
nvvm.cp.async.shared.global %113, %116, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1>
nvvm.cp.async.commit.group
nvvm.barrier0
%117 = llvm.add %84, %24 : i64
%118 = llvm.mul %27, %15 : i64
%119 = llvm.mul %81, %16 : i64
%120 = llvm.add %118, %119 : i64
%121 = llvm.add %120, %84 : i64
%122 = llvm.getelementptr %40[%121] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%123 = llvm.mul %80, %23 : i64
%124 = llvm.add %123, %117 : i64
%125 = llvm.getelementptr %arg0[%124] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
nvvm.cp.async.shared.global %122, %125, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1>
%126 = llvm.add %95, %24 : i64
%127 = llvm.mul %27, %10 : i64
%128 = llvm.mul %95, %38 : i64
%129 = llvm.add %127, %128 : i64
%130 = llvm.add %129, %96 : i64
%131 = llvm.getelementptr %41[%130] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%132 = llvm.mul %126, %23 : i64
%133 = llvm.add %132, %91 : i64
%134 = llvm.getelementptr %arg1[%133] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
nvvm.cp.async.shared.global %131, %134, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1>
nvvm.cp.async.commit.group
nvvm.barrier0
%135 = llvm.add %84, %32 : i64
%136 = llvm.mul %28, %15 : i64
%137 = llvm.mul %81, %16 : i64
%138 = llvm.add %136, %137 : i64
%139 = llvm.add %138, %84 : i64
%140 = llvm.getelementptr %40[%139] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%141 = llvm.mul %80, %23 : i64
%142 = llvm.add %141, %135 : i64
%143 = llvm.getelementptr %arg0[%142] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
nvvm.cp.async.shared.global %140, %143, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1>
%144 = llvm.add %95, %32 : i64
%145 = llvm.mul %28, %10 : i64
%146 = llvm.mul %95, %38 : i64
%147 = llvm.add %145, %146 : i64
%148 = llvm.add %147, %96 : i64
%149 = llvm.getelementptr %41[%148] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%150 = llvm.mul %144, %23 : i64
%151 = llvm.add %150, %91 : i64
%152 = llvm.getelementptr %arg1[%151] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
nvvm.cp.async.shared.global %149, %152, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1>
nvvm.cp.async.commit.group
nvvm.barrier0
%153 = llvm.add %84, %36 : i64
%154 = llvm.mul %29, %15 : i64
%155 = llvm.mul %81, %16 : i64
%156 = llvm.add %154, %155 : i64
%157 = llvm.add %156, %84 : i64
%158 = llvm.getelementptr %40[%157] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%159 = llvm.mul %80, %23 : i64
%160 = llvm.add %159, %153 : i64
%161 = llvm.getelementptr %arg0[%160] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
nvvm.cp.async.shared.global %158, %161, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1>
%162 = llvm.add %95, %36 : i64
%163 = llvm.mul %29, %10 : i64
%164 = llvm.mul %95, %38 : i64
%165 = llvm.add %163, %164 : i64
%166 = llvm.add %165, %96 : i64
%167 = llvm.getelementptr %41[%166] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%168 = llvm.mul %162, %23 : i64
%169 = llvm.add %168, %91 : i64
%170 = llvm.getelementptr %arg1[%169] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
nvvm.cp.async.shared.global %167, %170, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1>
nvvm.cp.async.commit.group
llvm.br ^bb1(%22, %49, %22, %27, %28, %29 : i64, !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>, i64, i64, i64, i64)
^bb1(%171: i64, %172: !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>, %173: i64, %174: i64, %175: i64, %176: i64): // 2 preds: ^bb0, ^bb2
%177 = llvm.icmp "slt" %171, %23 : i64
llvm.cond_br %177, ^bb2, ^bb3
^bb2: // pred: ^bb1
%178 = llvm.icmp "slt" %171, %30 : i64
nvvm.cp.async.wait.group 3
nvvm.barrier0
%179 = llvm.mul %173, %15 : i64
%180 = llvm.mul %69, %16 : i64
%181 = llvm.add %179, %180 : i64
%182 = llvm.add %181, %22 : i64
%183 = llvm.getelementptr %40[%182] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%184 = nvvm.wmma.load %183, %4 {eltype = #nvvm.mma_type<tf32>, frag = #nvvm.mma_frag<a>, k = 8 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
%185 = llvm.mul %173, %15 : i64
%186 = llvm.mul %69, %16 : i64
%187 = llvm.add %185, %186 : i64
%188 = llvm.add %187, %25 : i64
%189 = llvm.getelementptr %40[%188] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%190 = nvvm.wmma.load %189, %4 {eltype = #nvvm.mma_type<tf32>, frag = #nvvm.mma_frag<a>, k = 8 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
%191 = llvm.mul %173, %10 : i64
%192 = llvm.mul %22, %38 : i64
%193 = llvm.add %191, %192 : i64
%194 = llvm.add %193, %100 : i64
%195 = llvm.getelementptr %41[%194] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%196 = nvvm.wmma.load %195, %3 {eltype = #nvvm.mma_type<tf32>, frag = #nvvm.mma_frag<b>, k = 8 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
%197 = llvm.mul %173, %10 : i64
%198 = llvm.mul %25, %38 : i64
%199 = llvm.add %197, %198 : i64
%200 = llvm.add %199, %100 : i64
%201 = llvm.getelementptr %41[%200] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%202 = nvvm.wmma.load %201, %3 {eltype = #nvvm.mma_type<tf32>, frag = #nvvm.mma_frag<b>, k = 8 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
%203 = llvm.extractvalue %184[0] : !llvm.struct<(i32, i32, i32, i32)>
%204 = llvm.extractvalue %184[1] : !llvm.struct<(i32, i32, i32, i32)>
%205 = llvm.extractvalue %184[2] : !llvm.struct<(i32, i32, i32, i32)>
%206 = llvm.extractvalue %184[3] : !llvm.struct<(i32, i32, i32, i32)>
%207 = llvm.extractvalue %196[0] : !llvm.struct<(i32, i32, i32, i32)>
%208 = llvm.extractvalue %196[1] : !llvm.struct<(i32, i32, i32, i32)>
%209 = llvm.extractvalue %196[2] : !llvm.struct<(i32, i32, i32, i32)>
%210 = llvm.extractvalue %196[3] : !llvm.struct<(i32, i32, i32, i32)>
%211 = llvm.extractvalue %172[0] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%212 = llvm.extractvalue %172[1] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%213 = llvm.extractvalue %172[2] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%214 = llvm.extractvalue %172[3] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%215 = llvm.extractvalue %172[4] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%216 = llvm.extractvalue %172[5] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%217 = llvm.extractvalue %172[6] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%218 = llvm.extractvalue %172[7] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%219 = nvvm.wmma.mma %203, %204, %205, %206, %207, %208, %209, %210, %211, %212, %213, %214, %215, %216, %217, %218 {eltypeA = #nvvm.mma_type<tf32>, eltypeB = #nvvm.mma_type<f32>, k = 8 : i32, layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (i32, i32, i32, i32, i32, i32, i32, i32, f32, f32, f32, f32, f32, f32, f32, f32) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%220 = llvm.extractvalue %190[0] : !llvm.struct<(i32, i32, i32, i32)>
%221 = llvm.extractvalue %190[1] : !llvm.struct<(i32, i32, i32, i32)>
%222 = llvm.extractvalue %190[2] : !llvm.struct<(i32, i32, i32, i32)>
%223 = llvm.extractvalue %190[3] : !llvm.struct<(i32, i32, i32, i32)>
%224 = llvm.extractvalue %202[0] : !llvm.struct<(i32, i32, i32, i32)>
%225 = llvm.extractvalue %202[1] : !llvm.struct<(i32, i32, i32, i32)>
%226 = llvm.extractvalue %202[2] : !llvm.struct<(i32, i32, i32, i32)>
%227 = llvm.extractvalue %202[3] : !llvm.struct<(i32, i32, i32, i32)>
%228 = llvm.extractvalue %219[0] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%229 = llvm.extractvalue %219[1] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%230 = llvm.extractvalue %219[2] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%231 = llvm.extractvalue %219[3] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%232 = llvm.extractvalue %219[4] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%233 = llvm.extractvalue %219[5] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%234 = llvm.extractvalue %219[6] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%235 = llvm.extractvalue %219[7] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%236 = nvvm.wmma.mma %220, %221, %222, %223, %224, %225, %226, %227, %228, %229, %230, %231, %232, %233, %234, %235 {eltypeA = #nvvm.mma_type<tf32>, eltypeB = #nvvm.mma_type<f32>, k = 8 : i32, layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (i32, i32, i32, i32, i32, i32, i32, i32, f32, f32, f32, f32, f32, f32, f32, f32) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
nvvm.barrier0
%237 = llvm.add %171, %31 : i64
%238 = llvm.add %237, %82 : i64
%239 = llvm.add %238, %83 : i64
%240 = llvm.icmp "slt" %237, %22 : i64
%241 = llvm.sub %37, %171 : i64
%242 = llvm.select %240, %241, %237 : i1, i64
%243 = llvm.sdiv %242, %24 : i64
%244 = llvm.sub %33, %243 : i64
%245 = llvm.select %240, %244, %243 : i1, i64
%246 = llvm.srem %245, %26 : i64
%247 = llvm.icmp "slt" %246, %22 : i64
%248 = llvm.add %246, %26 : i64
%249 = llvm.select %247, %248, %246 : i1, i64
%250 = llvm.select %178, %26, %22 : i1, i64
%251 = llvm.mul %249, %15 : i64
%252 = llvm.mul %81, %16 : i64
%253 = llvm.add %251, %252 : i64
%254 = llvm.add %253, %84 : i64
%255 = llvm.getelementptr %40[%254] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%256 = llvm.mul %80, %23 : i64
%257 = llvm.add %256, %239 : i64
%258 = llvm.getelementptr %arg0[%257] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
%259 = llvm.trunc %250 : i64 to i32
%260 = llvm.mul %259, %1 : i32
%261 = llvm.lshr %260, %2 : i32
llvm.inline_asm has_side_effects asm_dialect = att "cp.async.cg.shared.global [$0], [$1], $2, $3;\0A", "r,l,n,r" %255, %258, %0, %261 : (!llvm.ptr<3>, !llvm.ptr<1>, i32, i32) -> ()
%262 = llvm.add %237, %92 : i64
%263 = llvm.add %262, %93 : i64
%264 = llvm.add %263, %89 : i64
%265 = llvm.mul %249, %10 : i64
%266 = llvm.mul %95, %38 : i64
%267 = llvm.add %265, %266 : i64
%268 = llvm.add %267, %96 : i64
%269 = llvm.getelementptr %41[%268] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%270 = llvm.mul %264, %23 : i64
%271 = llvm.add %270, %91 : i64
%272 = llvm.getelementptr %arg1[%271] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
%273 = llvm.trunc %250 : i64 to i32
%274 = llvm.mul %273, %1 : i32
%275 = llvm.lshr %274, %2 : i32
llvm.inline_asm has_side_effects asm_dialect = att "cp.async.cg.shared.global [$0], [$1], $2, $3;\0A", "r,l,n,r" %269, %272, %0, %275 : (!llvm.ptr<3>, !llvm.ptr<1>, i32, i32) -> ()
nvvm.cp.async.commit.group
%276 = llvm.add %171, %24 : i64
llvm.br ^bb1(%276, %236, %174, %175, %176, %249 : i64, !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>, i64, i64, i64, i64)
^bb3: // pred: ^bb1
%277 = llvm.extractvalue %172[0] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%278 = llvm.extractvalue %172[1] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%279 = llvm.extractvalue %172[2] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%280 = llvm.extractvalue %172[3] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%281 = llvm.extractvalue %172[4] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%282 = llvm.extractvalue %172[5] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%283 = llvm.extractvalue %172[6] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%284 = llvm.extractvalue %172[7] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%285 = llvm.mul %69, %38 : i64
%286 = llvm.add %285, %100 : i64
%287 = llvm.getelementptr %39[%286] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
nvvm.wmma.store %287, %3, %277, %278, %279, %280, %281, %282, %283, %284 {eltype = #nvvm.mma_type<f32>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : !llvm.ptr<3>, f32, f32, f32, f32, f32, f32, f32, f32
nvvm.barrier0
%288 = llvm.mul %95, %38 : i64
%289 = llvm.add %288, %96 : i64
%290 = llvm.getelementptr %39[%289] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%291 = llvm.load %290 {alignment = 4 : i64} : !llvm.ptr<3> -> vector<4xf32>
%292 = llvm.add %94, %72 : i64
%293 = llvm.add %292, %89 : i64
%294 = llvm.mul %293, %23 : i64
%295 = llvm.add %294, %91 : i64
%296 = llvm.getelementptr %arg2[%295] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
llvm.store %291, %296 {alignment = 4 : i64} : vector<4xf32>, !llvm.ptr<1>
%297 = llvm.mul %126, %38 : i64
%298 = llvm.add %297, %96 : i64
%299 = llvm.getelementptr %39[%298] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%300 = llvm.load %299 {alignment = 4 : i64} : !llvm.ptr<3> -> vector<4xf32>
%301 = llvm.add %293, %24 : i64
%302 = llvm.mul %301, %23 : i64
%303 = llvm.add %302, %91 : i64
%304 = llvm.getelementptr %arg2[%303] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
llvm.store %300, %304 {alignment = 4 : i64} : vector<4xf32>, !llvm.ptr<1>
nvvm.barrier0
llvm.return
}
}
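In this NVVM-level dump the pipelining ops have become their PTX-level counterparts: the prologue copies lower to nvvm.cp.async.shared.global plus nvvm.cp.async.commit.group, the in-loop wait becomes nvvm.cp.async.wait.group 3, and the predicated in-loop copy is emitted as inline cp.async.cg.shared.global asm whose last operand is the source byte count (the select of 4 or 0 elements, scaled to 16 or 0 bytes by the mul/lshr pair), so iterations past 960 commit zero-filled groups instead of reading out of bounds. The nvvm.wmma.* ops carry tf32 m16n16k8 fragments. For reference, a minimal sketch of the same two-step k=16 accumulation written against the CUDA C++ wmma API (not IREE's output) follows; tf32_tile_mma, a_smem, and b_smem are illustrative names, the kernel assumes a single-warp block, and the padded leading dimensions 20 and 36 mirror the shared allocations in the dump.

// Minimal sketch of the tf32 m16n16k8 WMMA sequence (sm_80+, launch with a
// single 32-thread block). One warp computes a 16x16 tile of C = A * B with
// K = 16 split into two k=8 steps, as in the loop body above.
#include <mma.h>
using namespace nvcuda;

__global__ void tf32_tile_mma(const float *A, const float *B, float *C) {
  // Shared staging buffers with padded leading dimensions, matching the
  // memref<...x32x20xf32> / memref<...x16x36xf32> allocations in the IR.
  __shared__ float a_smem[16][20];
  __shared__ float b_smem[16][36];

  // Stage one 16x16 A tile and one 16x16 B tile (row-major, lda = ldb = 16).
  for (int i = threadIdx.x; i < 16 * 16; i += blockDim.x) {
    a_smem[i / 16][i % 16] = A[i];
    b_smem[i / 16][i % 16] = B[i];
  }
  __syncthreads();

  wmma::fragment<wmma::matrix_a, 16, 16, 8, wmma::precision::tf32, wmma::row_major> a;
  wmma::fragment<wmma::matrix_b, 16, 16, 8, wmma::precision::tf32, wmma::row_major> b;
  wmma::fragment<wmma::accumulator, 16, 16, 8, float> c;
  wmma::fill_fragment(c, 0.0f);

  for (int k = 0; k < 16; k += 8) {                // two k=8 steps, like %70 / %71
    wmma::load_matrix_sync(a, &a_smem[0][k], 20);  // leadDimension = 20
    wmma::load_matrix_sync(b, &b_smem[k][0], 36);  // leadDimension = 36
    // The wmma API requires explicitly rounding the loaded f32 values to tf32.
    for (int i = 0; i < a.num_elements; ++i) a.x[i] = wmma::__float_to_tf32(a.x[i]);
    for (int i = 0; i < b.num_elements; ++i) b.x[i] = wmma::__float_to_tf32(b.x[i]);
    wmma::mma_sync(c, a, b, c);
  }

  wmma::store_matrix_sync(C, c, 16, wmma::mem_row_major); // row-major, ldc = 16
}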