bjacob/8c7f6ea08a15b52af5b3898dffbbb780 (created September 20, 2021 18:00)
  | // -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass //----- // | |
| #map0 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
| #map1 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
| #map2 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
| #map3 = affine_map<(d0, d1) -> (d0, d1)> | |
| module { | |
| func private @actual(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> attributes {noinline} { | |
| %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| return %0 : tensor<?x?xf32> | |
| } | |
| func private @expected(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> attributes {noinline} { | |
| %0 = linalg.generic {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) { | |
| ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): // no predecessors | |
| %1 = mulf %arg3, %arg4 : f32 | |
| %2 = addf %1, %arg5 : f32 | |
| linalg.yield %2 : f32 | |
| } -> tensor<?x?xf32> | |
| return %0 : tensor<?x?xf32> | |
| } | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c10 = constant 10 : index | |
| %c10_0 = constant 10 : index | |
| %0 = linalg.init_tensor [%c10, %c10_0] : tensor<?x?xf32> | |
| %1 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<?x?xf32>) outs(%0 : tensor<?x?xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_5 = constant 1.000000e+00 : f32 | |
| %11 = select %10, %cst, %cst_5 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<?x?xf32> | |
| %c10_1 = constant 10 : index | |
| %c10_2 = constant 10 : index | |
| %2 = linalg.init_tensor [%c10_1, %c10_2] : tensor<?x?xf32> | |
| %3 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<?x?xf32>) outs(%2 : tensor<?x?xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_5 = constant 1.000000e+00 : f32 | |
| %11 = select %10, %cst, %cst_5 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<?x?xf32> | |
| %c10_3 = constant 10 : index | |
| %c10_4 = constant 10 : index | |
| %4 = linalg.init_tensor [%c10_3, %c10_4] : tensor<?x?xf32> | |
| %5 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_5 = constant 1.000000e+00 : f32 | |
| %11 = select %10, %cst, %cst_5 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<?x?xf32> | |
| %6 = call @actual(%1, %3, %5) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32> | |
| %7 = call @expected(%1, %3, %5) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32> | |
| check.expect_eq(%6, %7) : tensor<?x?xf32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
| func private @actual(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> attributes {noinline} { | |
| %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| return %0 : tensor<?x?xf32> | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
| func private @expected(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> attributes {noinline} { | |
| %0 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) { | |
| ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): // no predecessors | |
| %1 = mulf %arg3, %arg4 : f32 | |
| %2 = addf %1, %arg5 : f32 | |
| linalg.yield %2 : f32 | |
| } -> tensor<?x?xf32> | |
| return %0 : tensor<?x?xf32> | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %11 = linalg.index 0 : index | |
| %12 = linalg.index 1 : index | |
| %13 = cmpi eq, %11, %12 : index | |
| %14 = select %13, %cst_0, %cst : f32 | |
| linalg.yield %14 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = tensor.cast %1 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %3 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %4 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%3 : tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %11 = linalg.index 0 : index | |
| %12 = linalg.index 1 : index | |
| %13 = cmpi eq, %11, %12 : index | |
| %14 = select %13, %cst_0, %cst : f32 | |
| linalg.yield %14 : f32 | |
| } -> tensor<10x10xf32> | |
| %5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %6 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<10x10xf32>) outs(%6 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %11 = linalg.index 0 : index | |
| %12 = linalg.index 1 : index | |
| %13 = cmpi eq, %11, %12 : index | |
| %14 = select %13, %cst_0, %cst : f32 | |
| linalg.yield %14 : f32 | |
| } -> tensor<10x10xf32> | |
| %8 = tensor.cast %7 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %9 = call @actual(%2, %5, %8) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32> | |
| %10 = call @expected(%2, %5, %8) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32> | |
| check.expect_eq(%9, %10) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %10 = linalg.index 0 : index | |
| %11 = linalg.index 1 : index | |
| %12 = cmpi eq, %10, %11 : index | |
| %13 = select %12, %cst, %cst_0 : f32 | |
| linalg.yield %13 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<10x10xf32>) outs(%2 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %10 = linalg.index 0 : index | |
| %11 = linalg.index 1 : index | |
| %12 = cmpi eq, %10, %11 : index | |
| %13 = select %12, %cst, %cst_0 : f32 | |
| linalg.yield %13 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<10x10xf32>) outs(%4 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %10 = linalg.index 0 : index | |
| %11 = linalg.index 1 : index | |
| %12 = cmpi eq, %10, %11 : index | |
| %13 = select %12, %cst, %cst_0 : f32 | |
| linalg.yield %13 : f32 | |
| } -> tensor<10x10xf32> | |
| %6 = linalg.matmul ins(%1, %3 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%5 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %3 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%5 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %10 = mulf %arg0, %arg1 : f32 | |
| %11 = addf %10, %arg2 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %9 = tensor.cast %8 : tensor<10x10xf32> to tensor<?x?xf32> | |
| check.expect_eq(%7, %9) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After Inliner //----- // | |
| #map0 = affine_map<(d0, d1) -> (d0, d1)> | |
| #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
| #map2 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
| #map3 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
| module { | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %10 = linalg.index 0 : index | |
| %11 = linalg.index 1 : index | |
| %12 = cmpi eq, %10, %11 : index | |
| %13 = select %12, %cst, %cst_0 : f32 | |
| linalg.yield %13 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<10x10xf32>) outs(%2 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %10 = linalg.index 0 : index | |
| %11 = linalg.index 1 : index | |
| %12 = cmpi eq, %10, %11 : index | |
| %13 = select %12, %cst, %cst_0 : f32 | |
| linalg.yield %13 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %5 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<10x10xf32>) outs(%4 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %10 = linalg.index 0 : index | |
| %11 = linalg.index 1 : index | |
| %12 = cmpi eq, %10, %11 : index | |
| %13 = select %12, %cst, %cst_0 : f32 | |
| linalg.yield %13 : f32 | |
| } -> tensor<10x10xf32> | |
| %6 = linalg.matmul ins(%1, %3 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%5 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %8 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %3 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%5 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %10 = mulf %arg0, %arg1 : f32 | |
| %11 = addf %10, %arg2 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %9 = tensor.cast %8 : tensor<10x10xf32> to tensor<?x?xf32> | |
| check.expect_eq(%7, %9) : tensor<?x?xf32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %10 = linalg.index 0 : index | |
| %11 = linalg.index 1 : index | |
| %12 = cmpi eq, %10, %11 : index | |
| %13 = select %12, %cst_0, %cst : f32 | |
| linalg.yield %13 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<10x10xf32>) outs(%2 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %10 = linalg.index 0 : index | |
| %11 = linalg.index 1 : index | |
| %12 = cmpi eq, %10, %11 : index | |
| %13 = select %12, %cst_0, %cst : f32 | |
| linalg.yield %13 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<10x10xf32>) outs(%4 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %10 = linalg.index 0 : index | |
| %11 = linalg.index 1 : index | |
| %12 = cmpi eq, %10, %11 : index | |
| %13 = select %12, %cst_0, %cst : f32 | |
| linalg.yield %13 : f32 | |
| } -> tensor<10x10xf32> | |
| %6 = linalg.matmul ins(%1, %3 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%5 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %3 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%5 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %10 = mulf %arg0, %arg1 : f32 | |
| %11 = addf %10, %arg2 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %9 = tensor.cast %8 : tensor<10x10xf32> to tensor<?x?xf32> | |
| check.expect_eq(%7, %9) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After CSE //----- // | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %8 = mulf %arg0, %arg1 : f32 | |
| %9 = addf %8, %arg2 : f32 | |
| linalg.yield %9 : f32 | |
| } -> tensor<10x10xf32> | |
| %7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32> | |
| check.expect_eq(%5, %7) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After SymbolDCE //----- // | |
| #map0 = affine_map<(d0, d1) -> (d0, d1)> | |
| #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
| #map2 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
| #map3 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
| module { | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %6 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %8 = mulf %arg0, %arg1 : f32 | |
| %9 = addf %8, %arg2 : f32 | |
| linalg.yield %9 : f32 | |
| } -> tensor<10x10xf32> | |
| %7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32> | |
| check.expect_eq(%5, %7) : tensor<?x?xf32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After IREEImportPublic //----- // | |
| #map0 = affine_map<(d0, d1) -> (d0, d1)> | |
| #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
| #map2 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
| #map3 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
| module { | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %6 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %8 = mulf %arg0, %arg1 : f32 | |
| %9 = addf %8, %arg2 : f32 | |
| linalg.yield %9 : f32 | |
| } -> tensor<10x10xf32> | |
| %7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32> | |
| check.expect_eq(%5, %7) : tensor<?x?xf32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After VerifyInputLegality //----- // | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %8 = mulf %arg0, %arg1 : f32 | |
| %9 = addf %8, %arg2 : f32 | |
| linalg.yield %9 : f32 | |
| } -> tensor<10x10xf32> | |
| %7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32> | |
| check.expect_eq(%5, %7) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After mlir::iree_compiler::IREE::Util::{anonymous}::SimplifyGlobalAccessesPass //----- // | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %8 = mulf %arg0, %arg1 : f32 | |
| %9 = addf %8, %arg2 : f32 | |
| linalg.yield %9 : f32 | |
| } -> tensor<10x10xf32> | |
| %7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32> | |
| check.expect_eq(%5, %7) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %8 = mulf %arg0, %arg1 : f32 | |
| %9 = addf %8, %arg2 : f32 | |
| linalg.yield %9 : f32 | |
| } -> tensor<10x10xf32> | |
| %7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32> | |
| check.expect_eq(%5, %7) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After CSE //----- // | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %8 = mulf %arg0, %arg1 : f32 | |
| %9 = addf %8, %arg2 : f32 | |
| linalg.yield %9 : f32 | |
| } -> tensor<10x10xf32> | |
| %7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32> | |
| check.expect_eq(%5, %7) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After ExpandGlobalDynamicDims //----- // | |
| #map0 = affine_map<(d0, d1) -> (d0, d1)> | |
| #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
| #map2 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
| #map3 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
| module { | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %6 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %8 = mulf %arg0, %arg1 : f32 | |
| %9 = addf %8, %arg2 : f32 | |
| linalg.yield %9 : f32 | |
| } -> tensor<10x10xf32> | |
| %7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32> | |
| check.expect_eq(%5, %7) : tensor<?x?xf32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After mlir::iree_compiler::Shape::{anonymous}::ExpandFunctionDynamicDimsPass //----- // | |
| #map0 = affine_map<(d0, d1) -> (d0, d1)> | |
| #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
| #map2 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
| #map3 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
| module { | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %6 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %8 = mulf %arg0, %arg1 : f32 | |
| %9 = addf %8, %arg2 : f32 | |
| linalg.yield %9 : f32 | |
| } -> tensor<10x10xf32> | |
| %7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32> | |
| check.expect_eq(%5, %7) : tensor<?x?xf32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After ConvertConv2D1x1ConvToMatmul //----- // | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %8 = mulf %arg0, %arg1 : f32 | |
| %9 = addf %8, %arg2 : f32 | |
| linalg.yield %9 : f32 | |
| } -> tensor<10x10xf32> | |
| %7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32> | |
| check.expect_eq(%5, %7) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After PadTensorToSubTensorInsert //----- // | |
| #map0 = affine_map<(d0, d1) -> (d0, d1)> | |
| #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
| #map2 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
| #map3 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
| module { | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %6 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %8 = mulf %arg0, %arg1 : f32 | |
| %9 = addf %8, %arg2 : f32 | |
| linalg.yield %9 : f32 | |
| } -> tensor<10x10xf32> | |
| %7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32> | |
| check.expect_eq(%5, %7) : tensor<?x?xf32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After ConvertElementwiseToLinalg //----- // | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %8 = mulf %arg0, %arg1 : f32 | |
| %9 = addf %8, %arg2 : f32 | |
| linalg.yield %9 : f32 | |
| } -> tensor<10x10xf32> | |
| %7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32> | |
| check.expect_eq(%5, %7) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After LinalgFoldUnitExtentDims //----- // | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %8 = mulf %arg0, %arg1 : f32 | |
| %9 = addf %8, %arg2 : f32 | |
| linalg.yield %9 : f32 | |
| } -> tensor<10x10xf32> | |
| %7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32> | |
| check.expect_eq(%5, %7) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After InterchangeGenericOps //----- // | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %8 = mulf %arg0, %arg1 : f32 | |
| %9 = addf %8, %arg2 : f32 | |
| linalg.yield %9 : f32 | |
| } -> tensor<10x10xf32> | |
| %7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32> | |
| check.expect_eq(%5, %7) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %8 = mulf %arg0, %arg1 : f32 | |
| %9 = addf %8, %arg2 : f32 | |
| linalg.yield %9 : f32 | |
| } -> tensor<10x10xf32> | |
| %7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32> | |
| check.expect_eq(%5, %7) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After FusionOfTensorOps //----- // | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %8 = mulf %arg0, %arg1 : f32 | |
| %9 = addf %8, %arg2 : f32 | |
| linalg.yield %9 : f32 | |
| } -> tensor<10x10xf32> | |
| %7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32> | |
| check.expect_eq(%5, %7) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After CSE //----- // | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %8 = mulf %arg0, %arg1 : f32 | |
| %9 = addf %8, %arg2 : f32 | |
| linalg.yield %9 : f32 | |
| } -> tensor<10x10xf32> | |
| %7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32> | |
| check.expect_eq(%5, %7) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After ResolveShapedTypeResultDims //----- // | |
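| // Same function as above, now printed at module scope with the affine maps hoisted into the #map0-#map3 aliases; the body is unchanged. | |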
| #map0 = affine_map<(d0, d1) -> (d0, d1)> | |
| #map1 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
| #map2 = affine_map<(d0, d1, d2) -> (d2, d1)> | |
| #map3 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
| module { | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32> | |
| %6 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %8 = mulf %arg0, %arg1 : f32 | |
| %9 = addf %8, %arg2 : f32 | |
| linalg.yield %9 : f32 | |
| } -> tensor<10x10xf32> | |
| %7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32> | |
| check.expect_eq(%5, %7) : tensor<?x?xf32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After ConvertToFlowBeforeDispatchFormation //----- // | |
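| // The tensor.cast ops that erased the static 10x10 shape are rewritten as flow.tensor.reshape ops carrying explicit dynamic extents {%c10, %c10}. | |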
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %8 = mulf %arg0, %arg1 : f32 | |
| %9 = addf %8, %arg2 : f32 | |
| linalg.yield %9 : f32 | |
| } -> tensor<10x10xf32> | |
| %7 = flow.tensor.reshape %6 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| check.expect_eq(%5, %7) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
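| // Canonicalization only renames and reorders the leading constants here (the 0.0/1.0 constants swap names and %c10 is hoisted first); the computation is unchanged. | |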
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32): // no predecessors | |
| %8 = linalg.index 0 : index | |
| %9 = linalg.index 1 : index | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| linalg.yield %11 : f32 | |
| } -> tensor<10x10xf32> | |
| %4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) { | |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors | |
| %8 = mulf %arg0, %arg1 : f32 | |
| %9 = addf %8, %arg2 : f32 | |
| linalg.yield %9 : f32 | |
| } -> tensor<10x10xf32> | |
| %7 = flow.tensor.reshape %6 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| check.expect_eq(%5, %7) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After DispatchLinalgOnTensors //----- // | |
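| // Each root linalg op is outlined into its own flow.dispatch.workgroups region: %0 and %1 build the two identity matrices, | |
| // %2 runs the tiled linalg.matmul and %4 the equivalent linalg.generic, each as nested scf.for loops over workgroup tiles; | |
| // the identity initializer feeding the accumulator is cloned into the consumer dispatches rather than dispatched separately. | |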
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c10 = constant 10 : index | |
| %c1 = constant 1 : index | |
| %0 = flow.dispatch.workgroups[%c10, %c10, %c1]() : () -> tensor<10x10xf32> = | |
| (%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10_1 = constant 10 : index | |
| %6 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1] | |
| %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg1 = %7 to %c10_1 step %8 { | |
| %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0] | |
| %10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg2 = %9 to %c10_1 step %10 { | |
| %11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1) | |
| %12 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2) | |
| %13 = tensor.extract_slice %6[%arg1, %arg2] [%11, %12] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1) | |
| %15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2) | |
| %16 = tensor.extract_slice %6[%arg1, %arg2] [%14, %15] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<?x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg3: f32, %arg4: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%18, %arg1) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%20, %arg2) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst_0, %cst : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg0, offsets = [%arg1, %arg2], sizes = [%14, %15], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| flow.return | |
| } | |
| %1 = flow.dispatch.workgroups[%c10, %c10, %c1]() : () -> tensor<10x10xf32> = | |
| (%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10_1 = constant 10 : index | |
| %6 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1] | |
| %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg1 = %7 to %c10_1 step %8 { | |
| %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0] | |
| %10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg2 = %9 to %c10_1 step %10 { | |
| %11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1) | |
| %12 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2) | |
| %13 = tensor.extract_slice %6[%arg1, %arg2] [%11, %12] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1) | |
| %15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2) | |
| %16 = tensor.extract_slice %6[%arg1, %arg2] [%14, %15] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<?x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg3: f32, %arg4: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%18, %arg1) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%20, %arg2) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst_0, %cst : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg0, offsets = [%arg1, %arg2], sizes = [%14, %15], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| flow.return | |
| } | |
| %2 = flow.dispatch.workgroups[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> = | |
| (%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10_1 = constant 10 : index | |
| %6 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1] | |
| %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %7 to %c10_1 step %8 { | |
| %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0] | |
| %10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %9 to %c10_1 step %10 { | |
| %11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3) | |
| %12 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%11, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4) | |
| %14 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %13], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3) | |
| %16 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4) | |
| %17 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1) | |
| %18 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0) | |
| %19 = tensor.extract_slice %6[%arg3, %arg4] [%17, %18] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %20 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1) | |
| %21 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0) | |
| %22 = tensor.extract_slice %6[%arg3, %arg4] [%20, %21] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %23 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%19 : tensor<?x?xf32>) outs(%22 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %25 = linalg.index 0 : index | |
| %26 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%25, %arg3) | |
| %27 = linalg.index 1 : index | |
| %28 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%27, %arg4) | |
| %29 = cmpi eq, %26, %28 : index | |
| %30 = select %29, %cst_0, %cst : f32 | |
| linalg.yield %30 : f32 | |
| } -> tensor<?x?xf32> | |
| %24 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%12, %14 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%23 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %24, %arg2, offsets = [%arg3, %arg4], sizes = [%15, %16], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| flow.return | |
| } | |
| %3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| %4 = flow.dispatch.workgroups[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> = | |
| (%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10_1 = constant 10 : index | |
| %6 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1] | |
| %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %7 to %c10_1 step %8 { | |
| %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0] | |
| %10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %9 to %c10_1 step %10 { | |
| %11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3) | |
| %12 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%11, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4) | |
| %14 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %13], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3) | |
| %16 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4) | |
| %17 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1) | |
| %18 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0) | |
| %19 = tensor.extract_slice %6[%arg3, %arg4] [%17, %18] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %20 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1) | |
| %21 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0) | |
| %22 = tensor.extract_slice %6[%arg3, %arg4] [%20, %21] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %23 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%19 : tensor<?x?xf32>) outs(%22 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %25 = linalg.index 0 : index | |
| %26 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%25, %arg3) | |
| %27 = linalg.index 1 : index | |
| %28 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%27, %arg4) | |
| %29 = cmpi eq, %26, %28 : index | |
| %30 = select %29, %cst_0, %cst : f32 | |
| linalg.yield %30 : f32 | |
| } -> tensor<?x?xf32> | |
| %24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%12, %14 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%23 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors | |
| %25 = mulf %arg5, %arg6 : f32 | |
| %26 = addf %25, %arg7 : f32 | |
| linalg.yield %26 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %24, %arg2, offsets = [%arg3, %arg4], sizes = [%15, %16], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| flow.return | |
| } | |
| %5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| check.expect_eq(%3, %5) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After ResolveShapedTypeResultDims //----- // | |
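| // Same dispatches as above, re-printed at module scope with the affine maps hoisted into #map0-#map7; no structural change. | |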
| #map0 = affine_map<()[s0, s1] -> (s0 * s1)> | |
| #map1 = affine_map<(d0, d1) -> (d0, -d1 + 10)> | |
| #map2 = affine_map<(d0, d1) -> (d0, d1)> | |
| #map3 = affine_map<(d0, d1) -> (d0 + d1)> | |
| #map4 = affine_map<(d0, d1) -> (-d0 + 10, d1)> | |
| #map5 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
| #map6 = affine_map<(d0, d1, d2) -> (d2, d1)> | |
| #map7 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
| module { | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c10 = constant 10 : index | |
| %c1 = constant 1 : index | |
| %0 = flow.dispatch.workgroups[%c10, %c10, %c1]() : () -> tensor<10x10xf32> = | |
| (%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10_1 = constant 10 : index | |
| %6 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %7 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %8 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg1 = %7 to %c10_1 step %8 { | |
| %9 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %10 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg2 = %9 to %c10_1 step %10 { | |
| %11 = affine.min #map1(%workgroup_size_1, %arg1) | |
| %12 = affine.min #map1(%workgroup_size_0, %arg2) | |
| %13 = tensor.extract_slice %6[%arg1, %arg2] [%11, %12] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %14 = affine.min #map1(%workgroup_size_1, %arg1) | |
| %15 = affine.min #map1(%workgroup_size_0, %arg2) | |
| %16 = tensor.extract_slice %6[%arg1, %arg2] [%14, %15] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<?x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg3: f32, %arg4: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg1) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg2) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst_0, %cst : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg0, offsets = [%arg1, %arg2], sizes = [%14, %15], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| flow.return | |
| } | |
| %1 = flow.dispatch.workgroups[%c10, %c10, %c1]() : () -> tensor<10x10xf32> = | |
| (%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10_1 = constant 10 : index | |
| %6 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %7 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %8 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg1 = %7 to %c10_1 step %8 { | |
| %9 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %10 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg2 = %9 to %c10_1 step %10 { | |
| %11 = affine.min #map1(%workgroup_size_1, %arg1) | |
| %12 = affine.min #map1(%workgroup_size_0, %arg2) | |
| %13 = tensor.extract_slice %6[%arg1, %arg2] [%11, %12] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %14 = affine.min #map1(%workgroup_size_1, %arg1) | |
| %15 = affine.min #map1(%workgroup_size_0, %arg2) | |
| %16 = tensor.extract_slice %6[%arg1, %arg2] [%14, %15] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<?x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg3: f32, %arg4: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg1) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg2) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst_0, %cst : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg0, offsets = [%arg1, %arg2], sizes = [%14, %15], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| flow.return | |
| } | |
| %2 = flow.dispatch.workgroups[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> = | |
| (%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10_1 = constant 10 : index | |
| %6 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %7 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %8 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %7 to %c10_1 step %8 { | |
| %9 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %10 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %9 to %c10_1 step %10 { | |
| %11 = affine.min #map1(%workgroup_size_1, %arg3) | |
| %12 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%11, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %13 = affine.min #map1(%workgroup_size_0, %arg4) | |
| %14 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %13], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %15 = affine.min #map1(%workgroup_size_1, %arg3) | |
| %16 = affine.min #map1(%workgroup_size_0, %arg4) | |
| %17 = affine.min #map4(%arg3, %workgroup_size_1) | |
| %18 = affine.min #map4(%arg4, %workgroup_size_0) | |
| %19 = tensor.extract_slice %6[%arg3, %arg4] [%17, %18] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %20 = affine.min #map4(%arg3, %workgroup_size_1) | |
| %21 = affine.min #map4(%arg4, %workgroup_size_0) | |
| %22 = tensor.extract_slice %6[%arg3, %arg4] [%20, %21] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %23 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%19 : tensor<?x?xf32>) outs(%22 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %25 = linalg.index 0 : index | |
| %26 = affine.apply #map3(%25, %arg3) | |
| %27 = linalg.index 1 : index | |
| %28 = affine.apply #map3(%27, %arg4) | |
| %29 = cmpi eq, %26, %28 : index | |
| %30 = select %29, %cst_0, %cst : f32 | |
| linalg.yield %30 : f32 | |
| } -> tensor<?x?xf32> | |
| %24 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%12, %14 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%23 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %24, %arg2, offsets = [%arg3, %arg4], sizes = [%15, %16], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| flow.return | |
| } | |
| %3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| %4 = flow.dispatch.workgroups[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> = | |
| (%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10_1 = constant 10 : index | |
| %6 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %7 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %8 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %7 to %c10_1 step %8 { | |
| %9 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %10 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %9 to %c10_1 step %10 { | |
| %11 = affine.min #map1(%workgroup_size_1, %arg3) | |
| %12 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%11, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %13 = affine.min #map1(%workgroup_size_0, %arg4) | |
| %14 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %13], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %15 = affine.min #map1(%workgroup_size_1, %arg3) | |
| %16 = affine.min #map1(%workgroup_size_0, %arg4) | |
| %17 = affine.min #map4(%arg3, %workgroup_size_1) | |
| %18 = affine.min #map4(%arg4, %workgroup_size_0) | |
| %19 = tensor.extract_slice %6[%arg3, %arg4] [%17, %18] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %20 = affine.min #map4(%arg3, %workgroup_size_1) | |
| %21 = affine.min #map4(%arg4, %workgroup_size_0) | |
| %22 = tensor.extract_slice %6[%arg3, %arg4] [%20, %21] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %23 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%19 : tensor<?x?xf32>) outs(%22 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %25 = linalg.index 0 : index | |
| %26 = affine.apply #map3(%25, %arg3) | |
| %27 = linalg.index 1 : index | |
| %28 = affine.apply #map3(%27, %arg4) | |
| %29 = cmpi eq, %26, %28 : index | |
| %30 = select %29, %cst_0, %cst : f32 | |
| linalg.yield %30 : f32 | |
| } -> tensor<?x?xf32> | |
| %24 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%12, %14 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%23 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors | |
| %25 = mulf %arg5, %arg6 : f32 | |
| %26 = addf %25, %arg7 : f32 | |
| linalg.yield %26 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %24, %arg2, offsets = [%arg3, %arg4], sizes = [%15, %16], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| flow.return | |
| } | |
| %5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| check.expect_eq(%3, %5) : tensor<?x?xf32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After ConvertToFlowAfterDispatchFormation //----- // | |
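| // No additional flow ops appear here; the function is re-printed standalone with the affine-map aliases inlined, and the dispatch regions are otherwise unchanged. | |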
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c10 = constant 10 : index | |
| %c1 = constant 1 : index | |
| %0 = flow.dispatch.workgroups[%c10, %c10, %c1]() : () -> tensor<10x10xf32> = | |
| (%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10_1 = constant 10 : index | |
| %6 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1] | |
| %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg1 = %7 to %c10_1 step %8 { | |
| %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0] | |
| %10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg2 = %9 to %c10_1 step %10 { | |
| %11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1) | |
| %12 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2) | |
| %13 = tensor.extract_slice %6[%arg1, %arg2] [%11, %12] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1) | |
| %15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2) | |
| %16 = tensor.extract_slice %6[%arg1, %arg2] [%14, %15] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<?x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg3: f32, %arg4: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%18, %arg1) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%20, %arg2) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst_0, %cst : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg0, offsets = [%arg1, %arg2], sizes = [%14, %15], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| flow.return | |
| } | |
| %1 = flow.dispatch.workgroups[%c10, %c10, %c1]() : () -> tensor<10x10xf32> = | |
| (%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10_1 = constant 10 : index | |
| %6 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1] | |
| %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg1 = %7 to %c10_1 step %8 { | |
| %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0] | |
| %10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg2 = %9 to %c10_1 step %10 { | |
| %11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1) | |
| %12 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2) | |
| %13 = tensor.extract_slice %6[%arg1, %arg2] [%11, %12] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1) | |
| %15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2) | |
| %16 = tensor.extract_slice %6[%arg1, %arg2] [%14, %15] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<?x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg3: f32, %arg4: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%18, %arg1) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%20, %arg2) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst_0, %cst : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg0, offsets = [%arg1, %arg2], sizes = [%14, %15], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| flow.return | |
| } | |
| %2 = flow.dispatch.workgroups[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> = | |
| (%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10_1 = constant 10 : index | |
| %6 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1] | |
| %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %7 to %c10_1 step %8 { | |
| %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0] | |
| %10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %9 to %c10_1 step %10 { | |
| %11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3) | |
| %12 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%11, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4) | |
| %14 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %13], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3) | |
| %16 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4) | |
| %17 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1) | |
| %18 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0) | |
| %19 = tensor.extract_slice %6[%arg3, %arg4] [%17, %18] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %20 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1) | |
| %21 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0) | |
| %22 = tensor.extract_slice %6[%arg3, %arg4] [%20, %21] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %23 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%19 : tensor<?x?xf32>) outs(%22 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %25 = linalg.index 0 : index | |
| %26 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%25, %arg3) | |
| %27 = linalg.index 1 : index | |
| %28 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%27, %arg4) | |
| %29 = cmpi eq, %26, %28 : index | |
| %30 = select %29, %cst_0, %cst : f32 | |
| linalg.yield %30 : f32 | |
| } -> tensor<?x?xf32> | |
| %24 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%12, %14 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%23 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %24, %arg2, offsets = [%arg3, %arg4], sizes = [%15, %16], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| flow.return | |
| } | |
| %3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| %4 = flow.dispatch.workgroups[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> = | |
| (%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10_1 = constant 10 : index | |
| %6 = linalg.init_tensor [10, 10] : tensor<10x10xf32> | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1] | |
| %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %7 to %c10_1 step %8 { | |
| %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0] | |
| %10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %9 to %c10_1 step %10 { | |
| %11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3) | |
| %12 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%11, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4) | |
| %14 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %13], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3) | |
| %16 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4) | |
| %17 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1) | |
| %18 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0) | |
| %19 = tensor.extract_slice %6[%arg3, %arg4] [%17, %18] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %20 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1) | |
| %21 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0) | |
| %22 = tensor.extract_slice %6[%arg3, %arg4] [%20, %21] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32> | |
| %23 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%19 : tensor<?x?xf32>) outs(%22 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %25 = linalg.index 0 : index | |
| %26 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%25, %arg3) | |
| %27 = linalg.index 1 : index | |
| %28 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%27, %arg4) | |
| %29 = cmpi eq, %26, %28 : index | |
| %30 = select %29, %cst_0, %cst : f32 | |
| linalg.yield %30 : f32 | |
| } -> tensor<?x?xf32> | |
| %24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%12, %14 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%23 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors | |
| %25 = mulf %arg5, %arg6 : f32 | |
| %26 = addf %25, %arg7 : f32 | |
| linalg.yield %26 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %24, %arg2, offsets = [%arg3, %arg4], sizes = [%15, %16], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| flow.return | |
| } | |
| %5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| check.expect_eq(%3, %5) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
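| // Inside each dispatch region, the tensor.extract_slice ops on the now-dead 10x10 init tensor fold into linalg.init_tensor ops sized to the tile, | |
| // and the unused linalg.init_tensor [10, 10] is dropped; the tiled loop structure is unchanged. | |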
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c1 = constant 1 : index | |
| %c10 = constant 10 : index | |
| %0 = flow.dispatch.workgroups[%c10, %c10, %c1]() : () -> tensor<10x10xf32> = | |
| (%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10_0 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_1 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1] | |
| %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg1 = %6 to %c10_0 step %7 { | |
| %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0] | |
| %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg2 = %8 to %c10_0 step %9 { | |
| %10 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1) | |
| %11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2) | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1) | |
| %14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2) | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg3: f32, %arg4: f32): // no predecessors | |
| %17 = linalg.index 0 : index | |
| %18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1) | |
| %19 = linalg.index 1 : index | |
| %20 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%19, %arg2) | |
| %21 = cmpi eq, %18, %20 : index | |
| %22 = select %21, %cst, %cst_1 : f32 | |
| linalg.yield %22 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %16, %arg0, offsets = [%arg1, %arg2], sizes = [%13, %14], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| flow.return | |
| } | |
| %1 = flow.dispatch.workgroups[%c10, %c10, %c1]() : () -> tensor<10x10xf32> = | |
| (%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10_0 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_1 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1] | |
| %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg1 = %6 to %c10_0 step %7 { | |
| %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0] | |
| %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg2 = %8 to %c10_0 step %9 { | |
| %10 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1) | |
| %11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2) | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1) | |
| %14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2) | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg3: f32, %arg4: f32): // no predecessors | |
| %17 = linalg.index 0 : index | |
| %18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1) | |
| %19 = linalg.index 1 : index | |
| %20 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%19, %arg2) | |
| %21 = cmpi eq, %18, %20 : index | |
| %22 = select %21, %cst, %cst_1 : f32 | |
| linalg.yield %22 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %16, %arg0, offsets = [%arg1, %arg2], sizes = [%13, %14], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| flow.return | |
| } | |
| %2 = flow.dispatch.workgroups[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> = | |
| (%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10_0 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_1 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1] | |
| %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %6 to %c10_0 step %7 { | |
| %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0] | |
| %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %8 to %c10_0 step %9 { | |
| %10 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3) | |
| %11 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%10, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %12 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4) | |
| %13 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %12], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3) | |
| %15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4) | |
| %16 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1) | |
| %17 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0) | |
| %18 = linalg.init_tensor [%16, %17] : tensor<?x?xf32> | |
| %19 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1) | |
| %20 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0) | |
| %21 = linalg.init_tensor [%19, %20] : tensor<?x?xf32> | |
| %22 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%18 : tensor<?x?xf32>) outs(%21 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %24 = linalg.index 0 : index | |
| %25 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%24, %arg3) | |
| %26 = linalg.index 1 : index | |
| %27 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%26, %arg4) | |
| %28 = cmpi eq, %25, %27 : index | |
| %29 = select %28, %cst, %cst_1 : f32 | |
| linalg.yield %29 : f32 | |
| } -> tensor<?x?xf32> | |
| %23 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%11, %13 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%22 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %23, %arg2, offsets = [%arg3, %arg4], sizes = [%14, %15], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| flow.return | |
| } | |
| %3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| %4 = flow.dispatch.workgroups[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> = | |
| (%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10_0 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_1 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1] | |
| %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %6 to %c10_0 step %7 { | |
| %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0] | |
| %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %8 to %c10_0 step %9 { | |
| %10 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3) | |
| %11 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%10, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %12 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4) | |
| %13 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %12], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3) | |
| %15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4) | |
| %16 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1) | |
| %17 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0) | |
| %18 = linalg.init_tensor [%16, %17] : tensor<?x?xf32> | |
| %19 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1) | |
| %20 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0) | |
| %21 = linalg.init_tensor [%19, %20] : tensor<?x?xf32> | |
| %22 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%18 : tensor<?x?xf32>) outs(%21 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %24 = linalg.index 0 : index | |
| %25 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%24, %arg3) | |
| %26 = linalg.index 1 : index | |
| %27 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%26, %arg4) | |
| %28 = cmpi eq, %25, %27 : index | |
| %29 = select %28, %cst, %cst_1 : f32 | |
| linalg.yield %29 : f32 | |
| } -> tensor<?x?xf32> | |
| %23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %13 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%22 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors | |
| %24 = mulf %arg5, %arg6 : f32 | |
| %25 = addf %24, %arg7 : f32 | |
| linalg.yield %25 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %23, %arg2, offsets = [%arg3, %arg4], sizes = [%14, %15], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| flow.return | |
| } | |
| %5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| check.expect_eq(%3, %5) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After OutlineDispatchRegions //----- // | |
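| // OutlineDispatchRegions: each inline flow.dispatch.workgroups region from the previous dump is outlined into its own flow.executable (@matmul_test_dispatch_0 .. _3) with a public flow.dispatch.entry; @matmul_test at the end of the module now reaches them through flow.dispatch ops. | |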
| #map0 = affine_map<()[s0, s1] -> (s0 * s1)> | |
| #map1 = affine_map<(d0, d1) -> (d0, -d1 + 10)> | |
| #map2 = affine_map<(d0, d1) -> (d0, d1)> | |
| #map3 = affine_map<(d0, d1) -> (d0 + d1)> | |
| #map4 = affine_map<(d0, d1) -> (-d0 + 10, d1)> | |
| #map5 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
| #map6 = affine_map<(d0, d1, d2) -> (d2, d1)> | |
| #map7 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
| module { | |
| flow.executable private @matmul_test_dispatch_0 { | |
| flow.dispatch.entry public @matmul_test_dispatch_0 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_0(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg1 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg2 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%workgroup_size_1, %arg1) | |
| %5 = affine.min #map1(%workgroup_size_0, %arg2) | |
| %6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32> | |
| %7 = affine.min #map1(%workgroup_size_1, %arg1) | |
| %8 = affine.min #map1(%workgroup_size_0, %arg2) | |
| %9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32> | |
| %10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg3: f32, %arg4: f32): // no predecessors | |
| %11 = linalg.index 0 : index | |
| %12 = affine.apply #map3(%11, %arg1) | |
| %13 = linalg.index 1 : index | |
| %14 = affine.apply #map3(%13, %arg2) | |
| %15 = cmpi eq, %12, %14 : index | |
| %16 = select %15, %cst, %cst_0 : f32 | |
| linalg.yield %16 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_1 { | |
| flow.dispatch.entry public @matmul_test_dispatch_1 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_1(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg1 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg2 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%workgroup_size_1, %arg1) | |
| %5 = affine.min #map1(%workgroup_size_0, %arg2) | |
| %6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32> | |
| %7 = affine.min #map1(%workgroup_size_1, %arg1) | |
| %8 = affine.min #map1(%workgroup_size_0, %arg2) | |
| %9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32> | |
| %10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg3: f32, %arg4: f32): // no predecessors | |
| %11 = linalg.index 0 : index | |
| %12 = affine.apply #map3(%11, %arg1) | |
| %13 = linalg.index 1 : index | |
| %14 = affine.apply #map3(%13, %arg2) | |
| %15 = cmpi eq, %12, %14 : index | |
| %16 = select %15, %cst, %cst_0 : f32 | |
| linalg.yield %16 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_2 { | |
| flow.dispatch.entry public @matmul_test_dispatch_2 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%workgroup_size_1, %arg3) | |
| %5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %6 = affine.min #map1(%workgroup_size_0, %arg4) | |
| %7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %8 = affine.min #map1(%workgroup_size_1, %arg3) | |
| %9 = affine.min #map1(%workgroup_size_0, %arg4) | |
| %10 = affine.min #map4(%arg3, %workgroup_size_1) | |
| %11 = affine.min #map4(%arg4, %workgroup_size_0) | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min #map4(%arg3, %workgroup_size_1) | |
| %14 = affine.min #map4(%arg4, %workgroup_size_0) | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg3) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg4) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst, %cst_0 : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_3 { | |
| flow.dispatch.entry public @matmul_test_dispatch_3 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%workgroup_size_1, %arg3) | |
| %5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %6 = affine.min #map1(%workgroup_size_0, %arg4) | |
| %7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %8 = affine.min #map1(%workgroup_size_1, %arg3) | |
| %9 = affine.min #map1(%workgroup_size_0, %arg4) | |
| %10 = affine.min #map4(%arg3, %workgroup_size_1) | |
| %11 = affine.min #map4(%arg4, %workgroup_size_0) | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min #map4(%arg3, %workgroup_size_1) | |
| %14 = affine.min #map4(%arg4, %workgroup_size_0) | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg3) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg4) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst, %cst_0 : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| %17 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors | |
| %18 = mulf %arg5, %arg6 : f32 | |
| %19 = addf %18, %arg7 : f32 | |
| linalg.yield %19 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c1 = constant 1 : index | |
| %c10 = constant 10 : index | |
| %0 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10, %c10, %c1]() : () -> tensor<10x10xf32> | |
| %1 = flow.dispatch @matmul_test_dispatch_1::@matmul_test_dispatch_1[%c10, %c10, %c1]() : () -> tensor<10x10xf32> | |
| %2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| %4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| check.expect_eq(%3, %5) : tensor<?x?xf32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c10 = constant 10 : index | |
| %c1 = constant 1 : index | |
| %0 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10, %c10, %c1]() : () -> tensor<10x10xf32> | |
| %1 = flow.dispatch @matmul_test_dispatch_1::@matmul_test_dispatch_1[%c10, %c10, %c1]() : () -> tensor<10x10xf32> | |
| %2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| %4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| check.expect_eq(%3, %5) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After DeduplicateExecutables //----- // | |
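| // DeduplicateExecutables: @matmul_test_dispatch_1 was identical to @matmul_test_dispatch_0, so that executable is dropped and its caller in @matmul_test is redirected to @matmul_test_dispatch_0. | |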
| #map0 = affine_map<()[s0, s1] -> (s0 * s1)> | |
| #map1 = affine_map<(d0, d1) -> (d0, -d1 + 10)> | |
| #map2 = affine_map<(d0, d1) -> (d0, d1)> | |
| #map3 = affine_map<(d0, d1) -> (d0 + d1)> | |
| #map4 = affine_map<(d0, d1) -> (-d0 + 10, d1)> | |
| #map5 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
| #map6 = affine_map<(d0, d1, d2) -> (d2, d1)> | |
| #map7 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
| module { | |
| flow.executable private @matmul_test_dispatch_0 { | |
| flow.dispatch.entry public @matmul_test_dispatch_0 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_0(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg1 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg2 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%workgroup_size_1, %arg1) | |
| %5 = affine.min #map1(%workgroup_size_0, %arg2) | |
| %6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32> | |
| %7 = affine.min #map1(%workgroup_size_1, %arg1) | |
| %8 = affine.min #map1(%workgroup_size_0, %arg2) | |
| %9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32> | |
| %10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg3: f32, %arg4: f32): // no predecessors | |
| %11 = linalg.index 0 : index | |
| %12 = affine.apply #map3(%11, %arg1) | |
| %13 = linalg.index 1 : index | |
| %14 = affine.apply #map3(%13, %arg2) | |
| %15 = cmpi eq, %12, %14 : index | |
| %16 = select %15, %cst, %cst_0 : f32 | |
| linalg.yield %16 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_2 { | |
| flow.dispatch.entry public @matmul_test_dispatch_2 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%workgroup_size_1, %arg3) | |
| %5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %6 = affine.min #map1(%workgroup_size_0, %arg4) | |
| %7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %8 = affine.min #map1(%workgroup_size_1, %arg3) | |
| %9 = affine.min #map1(%workgroup_size_0, %arg4) | |
| %10 = affine.min #map4(%arg3, %workgroup_size_1) | |
| %11 = affine.min #map4(%arg4, %workgroup_size_0) | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min #map4(%arg3, %workgroup_size_1) | |
| %14 = affine.min #map4(%arg4, %workgroup_size_0) | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg3) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg4) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst, %cst_0 : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_3 { | |
| flow.dispatch.entry public @matmul_test_dispatch_3 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%workgroup_size_1, %arg3) | |
| %5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %6 = affine.min #map1(%workgroup_size_0, %arg4) | |
| %7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %8 = affine.min #map1(%workgroup_size_1, %arg3) | |
| %9 = affine.min #map1(%workgroup_size_0, %arg4) | |
| %10 = affine.min #map4(%arg3, %workgroup_size_1) | |
| %11 = affine.min #map4(%arg4, %workgroup_size_0) | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min #map4(%arg3, %workgroup_size_1) | |
| %14 = affine.min #map4(%arg4, %workgroup_size_0) | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg3) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg4) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst, %cst_0 : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| %17 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors | |
| %18 = mulf %arg5, %arg6 : f32 | |
| %19 = addf %18, %arg7 : f32 | |
| linalg.yield %19 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c10 = constant 10 : index | |
| %c1 = constant 1 : index | |
| %0 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10, %c10, %c1]() : () -> tensor<10x10xf32> | |
| %1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10, %c10, %c1]() : () -> tensor<10x10xf32> | |
| %2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| %4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| check.expect_eq(%3, %5) : tensor<?x?xf32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c1 = constant 1 : index | |
| %c10 = constant 10 : index | |
| %0 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10, %c10, %c1]() : () -> tensor<10x10xf32> | |
| %1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10, %c10, %c1]() : () -> tensor<10x10xf32> | |
| %2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| %4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| check.expect_eq(%3, %5) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After CSE //----- // | |
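| // CSE: the two identical flow.dispatch calls to @matmul_test_dispatch_0 are merged, so both matmul dispatches below now consume the same %0 result. | |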
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c1 = constant 1 : index | |
| %c10 = constant 10 : index | |
| %0 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10, %c10, %c1]() : () -> tensor<10x10xf32> | |
| %1 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10, %c10, %c1](%0, %0) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %2 = flow.tensor.reshape %1 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| %3 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10, %c10, %c1](%0, %0) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %4 = flow.tensor.reshape %3 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| check.expect_eq(%2, %4) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After HoistUnstreamableOps //----- // | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c10 = constant 10 : index | |
| %c1 = constant 1 : index | |
| %0 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10, %c10, %c1]() : () -> tensor<10x10xf32> | |
| %1 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10, %c10, %c1](%0, %0) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %2 = flow.tensor.reshape %1 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| %3 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10, %c10, %c1](%0, %0) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %4 = flow.tensor.reshape %3 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| check.expect_eq(%2, %4) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c1 = constant 1 : index | |
| %c10 = constant 10 : index | |
| %0 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10, %c10, %c1]() : () -> tensor<10x10xf32> | |
| %1 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10, %c10, %c1](%0, %0) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %2 = flow.tensor.reshape %1 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| %3 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10, %c10, %c1](%0, %0) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %4 = flow.tensor.reshape %3 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| check.expect_eq(%2, %4) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After CSE //----- // | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c1 = constant 1 : index | |
| %c10 = constant 10 : index | |
| %0 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10, %c10, %c1]() : () -> tensor<10x10xf32> | |
| %1 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10, %c10, %c1](%0, %0) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %2 = flow.tensor.reshape %1 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| %3 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10, %c10, %c1](%0, %0) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %4 = flow.tensor.reshape %3 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| check.expect_eq(%2, %4) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After InsertConstantClones //----- // | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c1 = constant 1 : index | |
| %c10 = constant 10 : index | |
| %0 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10, %c10, %c1]() : () -> tensor<10x10xf32> | |
| %1 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10, %c10, %c1](%0, %0) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %2 = flow.tensor.reshape %1 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| %3 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10, %c10, %c1](%0, %0) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %4 = flow.tensor.reshape %3 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10} | |
| check.expect_eq(%2, %4) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After FormStreams //----- // | |
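| // FormStreams: the dispatches and reshapes are grouped into a single flow.ex.stream.fragment; the shapex.make_ranked_shape / shapex.ranked_dim ops materialize the dynamic ?x? result shapes for the fragment results. | |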
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c1 = constant 1 : index | |
| %c10 = constant 10 : index | |
| %0 = shapex.make_ranked_shape %c10, %c10 : (index, index) -> !shapex.ranked_shape<[?,?]> | |
| %1 = shapex.ranked_dim %0[0] : !shapex.ranked_shape<[?,?]> -> index | |
| %2 = shapex.ranked_dim %0[1] : !shapex.ranked_shape<[?,?]> -> index | |
| %3 = shapex.make_ranked_shape %c10, %c10 : (index, index) -> !shapex.ranked_shape<[?,?]> | |
| %4 = shapex.ranked_dim %3[0] : !shapex.ranked_shape<[?,?]> -> index | |
| %5 = shapex.ranked_dim %3[1] : !shapex.ranked_shape<[?,?]> -> index | |
| %6:2 = flow.ex.stream.fragment(%c10, %c1) : (index, index) -> (tensor<?x?xf32>{%1, %2}, tensor<?x?xf32>{%4, %5}) = | |
| (%arg0: index, %arg1: index) -> (tensor<?x?xf32>, tensor<?x?xf32>) { | |
| %7 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%arg0, %arg0, %arg1]() : () -> tensor<10x10xf32> | |
| %8 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%arg0, %arg0, %arg1](%7, %7) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %9 = flow.tensor.reshape %8 : tensor<10x10xf32> -> tensor<?x?xf32>{%arg0, %arg0} | |
| %10 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%arg0, %arg0, %arg1](%7, %7) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %11 = flow.tensor.reshape %10 : tensor<10x10xf32> -> tensor<?x?xf32>{%arg0, %arg0} | |
| flow.return %9, %11 : tensor<?x?xf32>, tensor<?x?xf32> | |
| } | |
| check.expect_eq(%6#0, %6#1) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After OutlineLargeConstants //----- // | |
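| // OutlineLargeConstants: this module only holds small scalar index/f32 constants, so nothing is outlined; the IR below is unchanged apart from being re-printed at module scope. | |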
| #map0 = affine_map<()[s0, s1] -> (s0 * s1)> | |
| #map1 = affine_map<(d0, d1) -> (d0, -d1 + 10)> | |
| #map2 = affine_map<(d0, d1) -> (d0, d1)> | |
| #map3 = affine_map<(d0, d1) -> (d0 + d1)> | |
| #map4 = affine_map<(d0, d1) -> (-d0 + 10, d1)> | |
| #map5 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
| #map6 = affine_map<(d0, d1, d2) -> (d2, d1)> | |
| #map7 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
| module { | |
| flow.executable private @matmul_test_dispatch_0 { | |
| flow.dispatch.entry public @matmul_test_dispatch_0 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_0(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg1 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg2 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%workgroup_size_1, %arg1) | |
| %5 = affine.min #map1(%workgroup_size_0, %arg2) | |
| %6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32> | |
| %7 = affine.min #map1(%workgroup_size_1, %arg1) | |
| %8 = affine.min #map1(%workgroup_size_0, %arg2) | |
| %9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32> | |
| %10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg3: f32, %arg4: f32): // no predecessors | |
| %11 = linalg.index 0 : index | |
| %12 = affine.apply #map3(%11, %arg1) | |
| %13 = linalg.index 1 : index | |
| %14 = affine.apply #map3(%13, %arg2) | |
| %15 = cmpi eq, %12, %14 : index | |
| %16 = select %15, %cst, %cst_0 : f32 | |
| linalg.yield %16 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_2 { | |
| flow.dispatch.entry public @matmul_test_dispatch_2 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%workgroup_size_1, %arg3) | |
| %5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %6 = affine.min #map1(%workgroup_size_0, %arg4) | |
| %7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %8 = affine.min #map1(%workgroup_size_1, %arg3) | |
| %9 = affine.min #map1(%workgroup_size_0, %arg4) | |
| %10 = affine.min #map4(%arg3, %workgroup_size_1) | |
| %11 = affine.min #map4(%arg4, %workgroup_size_0) | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min #map4(%arg3, %workgroup_size_1) | |
| %14 = affine.min #map4(%arg4, %workgroup_size_0) | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg3) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg4) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst, %cst_0 : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_3 { | |
| flow.dispatch.entry public @matmul_test_dispatch_3 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%workgroup_size_1, %arg3) | |
| %5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %6 = affine.min #map1(%workgroup_size_0, %arg4) | |
| %7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %8 = affine.min #map1(%workgroup_size_1, %arg3) | |
| %9 = affine.min #map1(%workgroup_size_0, %arg4) | |
| %10 = affine.min #map4(%arg3, %workgroup_size_1) | |
| %11 = affine.min #map4(%arg4, %workgroup_size_0) | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min #map4(%arg3, %workgroup_size_1) | |
| %14 = affine.min #map4(%arg4, %workgroup_size_0) | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg3) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg4) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst, %cst_0 : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| %17 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors | |
| %18 = mulf %arg5, %arg6 : f32 | |
| %19 = addf %18, %arg7 : f32 | |
| linalg.yield %19 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c1 = constant 1 : index | |
| %c10 = constant 10 : index | |
| %0 = shapex.make_ranked_shape %c10, %c10 : (index, index) -> !shapex.ranked_shape<[?,?]> | |
| %1 = shapex.ranked_dim %0[0] : !shapex.ranked_shape<[?,?]> -> index | |
| %2 = shapex.ranked_dim %0[1] : !shapex.ranked_shape<[?,?]> -> index | |
| %3 = shapex.make_ranked_shape %c10, %c10 : (index, index) -> !shapex.ranked_shape<[?,?]> | |
| %4 = shapex.ranked_dim %3[0] : !shapex.ranked_shape<[?,?]> -> index | |
| %5 = shapex.ranked_dim %3[1] : !shapex.ranked_shape<[?,?]> -> index | |
| %6:2 = flow.ex.stream.fragment(%c10, %c1) : (index, index) -> (tensor<?x?xf32>{%1, %2}, tensor<?x?xf32>{%4, %5}) = | |
| (%arg0: index, %arg1: index) -> (tensor<?x?xf32>, tensor<?x?xf32>) { | |
| %7 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%arg0, %arg0, %arg1]() : () -> tensor<10x10xf32> | |
| %8 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%arg0, %arg0, %arg1](%7, %7) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %9 = flow.tensor.reshape %8 : tensor<10x10xf32> -> tensor<?x?xf32>{%arg0, %arg0} | |
| %10 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%arg0, %arg0, %arg1](%7, %7) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %11 = flow.tensor.reshape %10 : tensor<10x10xf32> -> tensor<?x?xf32>{%arg0, %arg0} | |
| flow.return %9, %11 : tensor<?x?xf32>, tensor<?x?xf32> | |
| } | |
| check.expect_eq(%6#0, %6#1) : tensor<?x?xf32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
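| // Canonicalizer: the shapex.make_ranked_shape / shapex.ranked_dim ops fold away to the constant %c10 dims, and the stream fragment no longer captures operands; the index constants are re-materialized inside the fragment body. | |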
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c10 = constant 10 : index | |
| %0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) = | |
| () -> (tensor<?x?xf32>, tensor<?x?xf32>) { | |
| %c10_0 = constant 10 : index | |
| %c1 = constant 1 : index | |
| %1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() : () -> tensor<10x10xf32> | |
| %2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| %4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32> | |
| } | |
| check.expect_eq(%0#0, %0#1) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After CSE //----- // | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c10 = constant 10 : index | |
| %0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) = | |
| () -> (tensor<?x?xf32>, tensor<?x?xf32>) { | |
| %c10_0 = constant 10 : index | |
| %c1 = constant 1 : index | |
| %1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() : () -> tensor<10x10xf32> | |
| %2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| %4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32> | |
| } | |
| check.expect_eq(%0#0, %0#1) : tensor<?x?xf32> | |
| return | |
| } | |
| // -----// IR Dump After SymbolDCE //----- // | |
| #map0 = affine_map<()[s0, s1] -> (s0 * s1)> | |
| #map1 = affine_map<(d0, d1) -> (d0, -d1 + 10)> | |
| #map2 = affine_map<(d0, d1) -> (d0, d1)> | |
| #map3 = affine_map<(d0, d1) -> (d0 + d1)> | |
| #map4 = affine_map<(d0, d1) -> (-d0 + 10, d1)> | |
| #map5 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
| #map6 = affine_map<(d0, d1, d2) -> (d2, d1)> | |
| #map7 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
| module { | |
| flow.executable private @matmul_test_dispatch_0 { | |
| flow.dispatch.entry public @matmul_test_dispatch_0 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_0(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg1 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg2 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%workgroup_size_1, %arg1) | |
| %5 = affine.min #map1(%workgroup_size_0, %arg2) | |
| %6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32> | |
| %7 = affine.min #map1(%workgroup_size_1, %arg1) | |
| %8 = affine.min #map1(%workgroup_size_0, %arg2) | |
| %9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32> | |
| %10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg3: f32, %arg4: f32): // no predecessors | |
| %11 = linalg.index 0 : index | |
| %12 = affine.apply #map3(%11, %arg1) | |
| %13 = linalg.index 1 : index | |
| %14 = affine.apply #map3(%13, %arg2) | |
| %15 = cmpi eq, %12, %14 : index | |
| %16 = select %15, %cst, %cst_0 : f32 | |
| linalg.yield %16 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_2 { | |
| flow.dispatch.entry public @matmul_test_dispatch_2 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%workgroup_size_1, %arg3) | |
| %5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %6 = affine.min #map1(%workgroup_size_0, %arg4) | |
| %7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %8 = affine.min #map1(%workgroup_size_1, %arg3) | |
| %9 = affine.min #map1(%workgroup_size_0, %arg4) | |
| %10 = affine.min #map4(%arg3, %workgroup_size_1) | |
| %11 = affine.min #map4(%arg4, %workgroup_size_0) | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min #map4(%arg3, %workgroup_size_1) | |
| %14 = affine.min #map4(%arg4, %workgroup_size_0) | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg3) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg4) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst, %cst_0 : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_3 { | |
| flow.dispatch.entry public @matmul_test_dispatch_3 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%workgroup_size_1, %arg3) | |
| %5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %6 = affine.min #map1(%workgroup_size_0, %arg4) | |
| %7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %8 = affine.min #map1(%workgroup_size_1, %arg3) | |
| %9 = affine.min #map1(%workgroup_size_0, %arg4) | |
| %10 = affine.min #map4(%arg3, %workgroup_size_1) | |
| %11 = affine.min #map4(%arg4, %workgroup_size_0) | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min #map4(%arg3, %workgroup_size_1) | |
| %14 = affine.min #map4(%arg4, %workgroup_size_0) | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg3) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg4) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst, %cst_0 : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| %17 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors | |
| %18 = mulf %arg5, %arg6 : f32 | |
| %19 = addf %18, %arg7 : f32 | |
| linalg.yield %19 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c10 = constant 10 : index | |
| %0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) = | |
| () -> (tensor<?x?xf32>, tensor<?x?xf32>) { | |
| %c10_0 = constant 10 : index | |
| %c1 = constant 1 : index | |
| %1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() : () -> tensor<10x10xf32> | |
| %2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| %4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32> | |
| } | |
| check.expect_eq(%0#0, %0#1) : tensor<?x?xf32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
| #map0 = affine_map<()[s0, s1] -> (s0 * s1)> | |
| #map1 = affine_map<(d0)[s0] -> (s0, -d0 + 10)> | |
| #map2 = affine_map<(d0, d1) -> (d0, d1)> | |
| #map3 = affine_map<(d0, d1) -> (d0 + d1)> | |
| #map4 = affine_map<(d0)[s0] -> (-d0 + 10, s0)> | |
| #map5 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
| #map6 = affine_map<(d0, d1, d2) -> (d2, d1)> | |
| #map7 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
| module { | |
| flow.executable private @matmul_test_dispatch_0 { | |
| flow.dispatch.entry public @matmul_test_dispatch_0 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_0(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg1 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg2 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%arg1)[%workgroup_size_1] | |
| %5 = affine.min #map1(%arg2)[%workgroup_size_0] | |
| %6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32> | |
| %7 = affine.min #map1(%arg1)[%workgroup_size_1] | |
| %8 = affine.min #map1(%arg2)[%workgroup_size_0] | |
| %9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32> | |
| %10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg3: f32, %arg4: f32): // no predecessors | |
| %11 = linalg.index 0 : index | |
| %12 = affine.apply #map3(%11, %arg1) | |
| %13 = linalg.index 1 : index | |
| %14 = affine.apply #map3(%13, %arg2) | |
| %15 = cmpi eq, %12, %14 : index | |
| %16 = select %15, %cst_0, %cst : f32 | |
| linalg.yield %16 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_2 { | |
| flow.dispatch.entry public @matmul_test_dispatch_2 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %6 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %8 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %9 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %10 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %11 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %14 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg3) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg4) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst_0, %cst : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_3 { | |
| flow.dispatch.entry public @matmul_test_dispatch_3 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %6 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %8 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %9 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %10 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %11 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %14 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg3) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg4) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst_0, %cst : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| %17 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors | |
| %18 = mulf %arg5, %arg6 : f32 | |
| %19 = addf %18, %arg7 : f32 | |
| linalg.yield %19 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c10 = constant 10 : index | |
| %0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) = | |
| () -> (tensor<?x?xf32>, tensor<?x?xf32>) { | |
| %c1 = constant 1 : index | |
| %c10_0 = constant 10 : index | |
| %1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() : () -> tensor<10x10xf32> | |
| %2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| %4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32> | |
| } | |
| check.expect_eq(%0#0, %0#1) : tensor<?x?xf32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After mlir::iree_compiler::IREE::HAL::AssignTargetDevicesPass //----- // | |
| #device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>]}> | |
| #map0 = affine_map<()[s0, s1] -> (s0 * s1)> | |
| #map1 = affine_map<(d0)[s0] -> (s0, -d0 + 10)> | |
| #map2 = affine_map<(d0, d1) -> (d0, d1)> | |
| #map3 = affine_map<(d0, d1) -> (d0 + d1)> | |
| #map4 = affine_map<(d0)[s0] -> (-d0 + 10, s0)> | |
| #map5 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
| #map6 = affine_map<(d0, d1, d2) -> (d2, d1)> | |
| #map7 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
| module attributes {hal.device.targets = [#device_target_cpu]} { | |
| flow.executable private @matmul_test_dispatch_0 { | |
| flow.dispatch.entry public @matmul_test_dispatch_0 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_0(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg1 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg2 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%arg1)[%workgroup_size_1] | |
| %5 = affine.min #map1(%arg2)[%workgroup_size_0] | |
| %6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32> | |
| %7 = affine.min #map1(%arg1)[%workgroup_size_1] | |
| %8 = affine.min #map1(%arg2)[%workgroup_size_0] | |
| %9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32> | |
| %10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg3: f32, %arg4: f32): // no predecessors | |
| %11 = linalg.index 0 : index | |
| %12 = affine.apply #map3(%11, %arg1) | |
| %13 = linalg.index 1 : index | |
| %14 = affine.apply #map3(%13, %arg2) | |
| %15 = cmpi eq, %12, %14 : index | |
| %16 = select %15, %cst_0, %cst : f32 | |
| linalg.yield %16 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_2 { | |
| flow.dispatch.entry public @matmul_test_dispatch_2 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %6 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %8 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %9 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %10 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %11 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %14 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg3) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg4) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst_0, %cst : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_3 { | |
| flow.dispatch.entry public @matmul_test_dispatch_3 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %6 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %8 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %9 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %10 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %11 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %14 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg3) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg4) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst_0, %cst : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| %17 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors | |
| %18 = mulf %arg5, %arg6 : f32 | |
| %19 = addf %18, %arg7 : f32 | |
| linalg.yield %19 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c10 = constant 10 : index | |
| %0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) = | |
| () -> (tensor<?x?xf32>, tensor<?x?xf32>) { | |
| %c1 = constant 1 : index | |
| %c10_0 = constant 10 : index | |
| %1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() : () -> tensor<10x10xf32> | |
| %2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| %4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32> | |
| } | |
| check.expect_eq(%0#0, %0#1) : tensor<?x?xf32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After mlir::iree_compiler::IREE::HAL::VerifyTargetEnvironmentPass //----- // | |
| #device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>]}> | |
| #map0 = affine_map<()[s0, s1] -> (s0 * s1)> | |
| #map1 = affine_map<(d0)[s0] -> (s0, -d0 + 10)> | |
| #map2 = affine_map<(d0, d1) -> (d0, d1)> | |
| #map3 = affine_map<(d0, d1) -> (d0 + d1)> | |
| #map4 = affine_map<(d0)[s0] -> (-d0 + 10, s0)> | |
| #map5 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
| #map6 = affine_map<(d0, d1, d2) -> (d2, d1)> | |
| #map7 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
| module attributes {hal.device.targets = [#device_target_cpu]} { | |
| flow.executable private @matmul_test_dispatch_0 { | |
| flow.dispatch.entry public @matmul_test_dispatch_0 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_0(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg1 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg2 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%arg1)[%workgroup_size_1] | |
| %5 = affine.min #map1(%arg2)[%workgroup_size_0] | |
| %6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32> | |
| %7 = affine.min #map1(%arg1)[%workgroup_size_1] | |
| %8 = affine.min #map1(%arg2)[%workgroup_size_0] | |
| %9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32> | |
| %10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg3: f32, %arg4: f32): // no predecessors | |
| %11 = linalg.index 0 : index | |
| %12 = affine.apply #map3(%11, %arg1) | |
| %13 = linalg.index 1 : index | |
| %14 = affine.apply #map3(%13, %arg2) | |
| %15 = cmpi eq, %12, %14 : index | |
| %16 = select %15, %cst_0, %cst : f32 | |
| linalg.yield %16 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_2 { | |
| flow.dispatch.entry public @matmul_test_dispatch_2 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %6 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %8 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %9 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %10 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %11 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %14 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg3) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg4) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst_0, %cst : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_3 { | |
| flow.dispatch.entry public @matmul_test_dispatch_3 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %6 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %8 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %9 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %10 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %11 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %14 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg3) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg4) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst_0, %cst : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| %17 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors | |
| %18 = mulf %arg5, %arg6 : f32 | |
| %19 = addf %18, %arg7 : f32 | |
| linalg.yield %19 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c10 = constant 10 : index | |
| %0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) = | |
| () -> (tensor<?x?xf32>, tensor<?x?xf32>) { | |
| %c1 = constant 1 : index | |
| %c10_0 = constant 10 : index | |
| %1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() : () -> tensor<10x10xf32> | |
| %2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| %4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32> | |
| } | |
| check.expect_eq(%0#0, %0#1) : tensor<?x?xf32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After mlir::iree_compiler::IREE::HAL::IdentifyConstantPoolsPass //----- // | |
| #device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>]}> | |
| #map0 = affine_map<()[s0, s1] -> (s0 * s1)> | |
| #map1 = affine_map<(d0)[s0] -> (s0, -d0 + 10)> | |
| #map2 = affine_map<(d0, d1) -> (d0, d1)> | |
| #map3 = affine_map<(d0, d1) -> (d0 + d1)> | |
| #map4 = affine_map<(d0)[s0] -> (-d0 + 10, s0)> | |
| #map5 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
| #map6 = affine_map<(d0, d1, d2) -> (d2, d1)> | |
| #map7 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
| module attributes {hal.device.targets = [#device_target_cpu]} { | |
| flow.executable private @matmul_test_dispatch_0 { | |
| flow.dispatch.entry public @matmul_test_dispatch_0 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_0(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg1 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg2 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%arg1)[%workgroup_size_1] | |
| %5 = affine.min #map1(%arg2)[%workgroup_size_0] | |
| %6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32> | |
| %7 = affine.min #map1(%arg1)[%workgroup_size_1] | |
| %8 = affine.min #map1(%arg2)[%workgroup_size_0] | |
| %9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32> | |
| %10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg3: f32, %arg4: f32): // no predecessors | |
| %11 = linalg.index 0 : index | |
| %12 = affine.apply #map3(%11, %arg1) | |
| %13 = linalg.index 1 : index | |
| %14 = affine.apply #map3(%13, %arg2) | |
| %15 = cmpi eq, %12, %14 : index | |
| %16 = select %15, %cst_0, %cst : f32 | |
| linalg.yield %16 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_2 { | |
| flow.dispatch.entry public @matmul_test_dispatch_2 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %6 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %8 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %9 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %10 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %11 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %14 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg3) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg4) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst_0, %cst : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_3 { | |
| flow.dispatch.entry public @matmul_test_dispatch_3 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %6 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %8 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %9 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %10 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %11 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %14 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg3) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg4) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst_0, %cst : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| %17 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors | |
| %18 = mulf %arg5, %arg6 : f32 | |
| %19 = addf %18, %arg7 : f32 | |
| linalg.yield %19 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c10 = constant 10 : index | |
| %0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) = | |
| () -> (tensor<?x?xf32>, tensor<?x?xf32>) { | |
| %c1 = constant 1 : index | |
| %c10_0 = constant 10 : index | |
| %1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() : () -> tensor<10x10xf32> | |
| %2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| %4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32> | |
| } | |
| check.expect_eq(%0#0, %0#1) : tensor<?x?xf32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeConstantPoolBuffersPass //----- // | |
| #device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>]}> | |
| #map0 = affine_map<()[s0, s1] -> (s0 * s1)> | |
| #map1 = affine_map<(d0)[s0] -> (s0, -d0 + 10)> | |
| #map2 = affine_map<(d0, d1) -> (d0, d1)> | |
| #map3 = affine_map<(d0, d1) -> (d0 + d1)> | |
| #map4 = affine_map<(d0)[s0] -> (-d0 + 10, s0)> | |
| #map5 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
| #map6 = affine_map<(d0, d1, d2) -> (d2, d1)> | |
| #map7 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
| module attributes {hal.device.targets = [#device_target_cpu]} { | |
| flow.executable private @matmul_test_dispatch_0 { | |
| flow.dispatch.entry public @matmul_test_dispatch_0 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_0(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg1 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg2 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%arg1)[%workgroup_size_1] | |
| %5 = affine.min #map1(%arg2)[%workgroup_size_0] | |
| %6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32> | |
| %7 = affine.min #map1(%arg1)[%workgroup_size_1] | |
| %8 = affine.min #map1(%arg2)[%workgroup_size_0] | |
| %9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32> | |
| %10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg3: f32, %arg4: f32): // no predecessors | |
| %11 = linalg.index 0 : index | |
| %12 = affine.apply #map3(%11, %arg1) | |
| %13 = linalg.index 1 : index | |
| %14 = affine.apply #map3(%13, %arg2) | |
| %15 = cmpi eq, %12, %14 : index | |
| %16 = select %15, %cst_0, %cst : f32 | |
| linalg.yield %16 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_2 { | |
| flow.dispatch.entry public @matmul_test_dispatch_2 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %6 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %8 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %9 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %10 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %11 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %14 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg3) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg4) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst_0, %cst : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_3 { | |
| flow.dispatch.entry public @matmul_test_dispatch_3 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %6 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %8 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %9 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %10 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %11 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %14 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg3) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg4) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst_0, %cst : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| %17 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors | |
| %18 = mulf %arg5, %arg6 : f32 | |
| %19 = addf %18, %arg7 : f32 | |
| linalg.yield %19 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c10 = constant 10 : index | |
| %0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) = | |
| () -> (tensor<?x?xf32>, tensor<?x?xf32>) { | |
| %c1 = constant 1 : index | |
| %c10_0 = constant 10 : index | |
| %1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() : () -> tensor<10x10xf32> | |
| %2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| %4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32> | |
| } | |
| check.expect_eq(%0#0, %0#1) : tensor<?x?xf32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
| #device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>]}> | |
| #map0 = affine_map<()[s0, s1] -> (s0 * s1)> | |
| #map1 = affine_map<(d0)[s0] -> (s0, -d0 + 10)> | |
| #map2 = affine_map<(d0, d1) -> (d0, d1)> | |
| #map3 = affine_map<(d0, d1) -> (d0 + d1)> | |
| #map4 = affine_map<(d0)[s0] -> (-d0 + 10, s0)> | |
| #map5 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
| #map6 = affine_map<(d0, d1, d2) -> (d2, d1)> | |
| #map7 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
| module attributes {hal.device.targets = [#device_target_cpu]} { | |
| flow.executable private @matmul_test_dispatch_0 { | |
| flow.dispatch.entry public @matmul_test_dispatch_0 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_0(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg1 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg2 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%arg1)[%workgroup_size_1] | |
| %5 = affine.min #map1(%arg2)[%workgroup_size_0] | |
| %6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32> | |
| %7 = affine.min #map1(%arg1)[%workgroup_size_1] | |
| %8 = affine.min #map1(%arg2)[%workgroup_size_0] | |
| %9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32> | |
| %10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg3: f32, %arg4: f32): // no predecessors | |
| %11 = linalg.index 0 : index | |
| %12 = affine.apply #map3(%11, %arg1) | |
| %13 = linalg.index 1 : index | |
| %14 = affine.apply #map3(%13, %arg2) | |
| %15 = cmpi eq, %12, %14 : index | |
| %16 = select %15, %cst, %cst_0 : f32 | |
| linalg.yield %16 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_2 { | |
| flow.dispatch.entry public @matmul_test_dispatch_2 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %6 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %8 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %9 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %10 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %11 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %14 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg3) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg4) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst, %cst_0 : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_3 { | |
| flow.dispatch.entry public @matmul_test_dispatch_3 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %6 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %8 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %9 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %10 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %11 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %14 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg3) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg4) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst, %cst_0 : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| %17 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors | |
| %18 = mulf %arg5, %arg6 : f32 | |
| %19 = addf %18, %arg7 : f32 | |
| linalg.yield %19 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c10 = constant 10 : index | |
| %0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) = | |
| () -> (tensor<?x?xf32>, tensor<?x?xf32>) { | |
| %c10_0 = constant 10 : index | |
| %c1 = constant 1 : index | |
| %1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() : () -> tensor<10x10xf32> | |
| %2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| %4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32> | |
| } | |
| check.expect_eq(%0#0, %0#1) : tensor<?x?xf32> | |
| return | |
| } | |
| } | |
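| // annotation (not part of the compiler output): the SymbolDCE dump below appears | |
| // byte-for-byte identical to the module above -- all three flow.executables and | |
| // @matmul_test are still referenced, so nothing is dead and nothing is removed. | |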
| // -----// IR Dump After SymbolDCE //----- // | |
| #device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>]}> | |
| #map0 = affine_map<()[s0, s1] -> (s0 * s1)> | |
| #map1 = affine_map<(d0)[s0] -> (s0, -d0 + 10)> | |
| #map2 = affine_map<(d0, d1) -> (d0, d1)> | |
| #map3 = affine_map<(d0, d1) -> (d0 + d1)> | |
| #map4 = affine_map<(d0)[s0] -> (-d0 + 10, s0)> | |
| #map5 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
| #map6 = affine_map<(d0, d1, d2) -> (d2, d1)> | |
| #map7 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
| module attributes {hal.device.targets = [#device_target_cpu]} { | |
| flow.executable private @matmul_test_dispatch_0 { | |
| flow.dispatch.entry public @matmul_test_dispatch_0 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_0(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg1 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg2 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%arg1)[%workgroup_size_1] | |
| %5 = affine.min #map1(%arg2)[%workgroup_size_0] | |
| %6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32> | |
| %7 = affine.min #map1(%arg1)[%workgroup_size_1] | |
| %8 = affine.min #map1(%arg2)[%workgroup_size_0] | |
| %9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32> | |
| %10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg3: f32, %arg4: f32): // no predecessors | |
| %11 = linalg.index 0 : index | |
| %12 = affine.apply #map3(%11, %arg1) | |
| %13 = linalg.index 1 : index | |
| %14 = affine.apply #map3(%13, %arg2) | |
| %15 = cmpi eq, %12, %14 : index | |
| %16 = select %15, %cst, %cst_0 : f32 | |
| linalg.yield %16 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_2 { | |
| flow.dispatch.entry public @matmul_test_dispatch_2 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %6 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %8 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %9 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %10 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %11 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %14 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg3) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg4) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst, %cst_0 : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| flow.executable private @matmul_test_dispatch_3 { | |
| flow.dispatch.entry public @matmul_test_dispatch_3 attributes {workgroup_rank = 3 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) { | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
| %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
| %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
| %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
| %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
| %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
| %0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1] | |
| %1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1] | |
| scf.for %arg3 = %0 to %c10 step %1 { | |
| %2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0] | |
| %3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0] | |
| scf.for %arg4 = %2 to %c10 step %3 { | |
| %4 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %6 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %8 = affine.min #map1(%arg3)[%workgroup_size_1] | |
| %9 = affine.min #map1(%arg4)[%workgroup_size_0] | |
| %10 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %11 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32> | |
| %13 = affine.min #map4(%arg3)[%workgroup_size_1] | |
| %14 = affine.min #map4(%arg4)[%workgroup_size_0] | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) { | |
| ^bb0(%arg5: f32, %arg6: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply #map3(%18, %arg3) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply #map3(%20, %arg4) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst, %cst_0 : f32 | |
| linalg.yield %23 : f32 | |
| } -> tensor<?x?xf32> | |
| %17 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors | |
| %18 = mulf %arg5, %arg6 : f32 | |
| %19 = addf %18, %arg7 : f32 | |
| linalg.yield %19 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c10 = constant 10 : index | |
| %0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) = | |
| () -> (tensor<?x?xf32>, tensor<?x?xf32>) { | |
| %c10_0 = constant 10 : index | |
| %c1 = constant 1 : index | |
| %1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() : () -> tensor<10x10xf32> | |
| %2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| %4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32> | |
| } | |
| check.expect_eq(%0#0, %0#1) : tensor<?x?xf32> | |
| return | |
| } | |
| } | |
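| // annotation (not part of the compiler output): in the MaterializeInterfacesPass | |
| // dump below, each flow.executable is rewrapped as a hal.executable with a | |
| // system-elf-x86_64 variant and an explicit @io interface, one storage-buffer | |
| // binding per dispatch tensor, e.g. for the matmul dispatches: | |
| //   hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
| //   hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
| // and the flow.dispatch sites in @matmul_test gain matching hal.bindings attributes. | |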
| // -----// IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeInterfacesPass //----- // | |
| #device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>]}> | |
| #executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> | |
| #map0 = affine_map<()[s0, s1] -> (s0 * s1)> | |
| #map1 = affine_map<(d0)[s0] -> (s0, -d0 + 10)> | |
| #map2 = affine_map<(d0, d1) -> (d0, d1)> | |
| #map3 = affine_map<(d0, d1) -> (d0 + d1)> | |
| #map4 = affine_map<(d0)[s0] -> (-d0 + 10, s0)> | |
| #map5 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
| #map6 = affine_map<(d0, d1, d2) -> (d2, d1)> | |
| #map7 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
| module attributes {hal.device.targets = [#device_target_cpu]} { | |
| hal.executable private @matmul_test_dispatch_0 { | |
| hal.interface public @io { | |
| hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard" | |
| } | |
| hal.executable.variant public @system_elf_x86_64, target = #executable_target_system_elf_x86_64_ { | |
| hal.executable.entry_point public @matmul_test_dispatch_0 attributes {interface = @io, ordinal = 0 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_0() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_size_x = hal.interface.workgroup.size[0] : index | |
| %workgroup_size_y = hal.interface.workgroup.size[1] : index | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y] | |
| %2 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y] | |
| scf.for %arg0 = %1 to %c10 step %2 { | |
| %3 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x] | |
| %4 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x] | |
| scf.for %arg1 = %3 to %c10 step %4 { | |
| %5 = affine.min #map1(%arg0)[%workgroup_size_y] | |
| %6 = affine.min #map1(%arg1)[%workgroup_size_x] | |
| %7 = linalg.init_tensor [%5, %6] : tensor<?x?xf32> | |
| %8 = affine.min #map1(%arg0)[%workgroup_size_y] | |
| %9 = affine.min #map1(%arg1)[%workgroup_size_x] | |
| %10 = linalg.init_tensor [%8, %9] : tensor<?x?xf32> | |
| %11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<?x?xf32>) outs(%10 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %12 = linalg.index 0 : index | |
| %13 = affine.apply #map3(%12, %arg0) | |
| %14 = linalg.index 1 : index | |
| %15 = affine.apply #map3(%14, %arg1) | |
| %16 = cmpi eq, %13, %15 : index | |
| %17 = select %16, %cst, %cst_0 : f32 | |
| linalg.yield %17 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %11, %0, offsets = [%arg0, %arg1], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
| } | |
| } | |
| hal.executable private @matmul_test_dispatch_2 { | |
| hal.interface public @io { | |
| hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
| } | |
| hal.executable.variant public @system_elf_x86_64, target = #executable_target_system_elf_x86_64_ { | |
| hal.executable.entry_point public @matmul_test_dispatch_2 attributes {interface = @io, ordinal = 0 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_2() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_size_x = hal.interface.workgroup.size[0] : index | |
| %workgroup_size_y = hal.interface.workgroup.size[1] : index | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y] | |
| %4 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x] | |
| %6 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min #map1(%arg0)[%workgroup_size_y] | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %9 = affine.min #map1(%arg1)[%workgroup_size_x] | |
| %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [10, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %11 = affine.min #map1(%arg0)[%workgroup_size_y] | |
| %12 = affine.min #map1(%arg1)[%workgroup_size_x] | |
| %13 = affine.min #map4(%arg0)[%workgroup_size_y] | |
| %14 = affine.min #map4(%arg1)[%workgroup_size_x] | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = affine.min #map4(%arg0)[%workgroup_size_y] | |
| %17 = affine.min #map4(%arg1)[%workgroup_size_x] | |
| %18 = linalg.init_tensor [%16, %17] : tensor<?x?xf32> | |
| %19 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<?x?xf32>) outs(%18 : tensor<?x?xf32>) { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %21 = linalg.index 0 : index | |
| %22 = affine.apply #map3(%21, %arg0) | |
| %23 = linalg.index 1 : index | |
| %24 = affine.apply #map3(%23, %arg1) | |
| %25 = cmpi eq, %22, %24 : index | |
| %26 = select %25, %cst, %cst_0 : f32 | |
| linalg.yield %26 : f32 | |
| } -> tensor<?x?xf32> | |
| %20 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%19 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %20, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
| } | |
| } | |
| hal.executable private @matmul_test_dispatch_3 { | |
| hal.interface public @io { | |
| hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
| } | |
| hal.executable.variant public @system_elf_x86_64, target = #executable_target_system_elf_x86_64_ { | |
| hal.executable.entry_point public @matmul_test_dispatch_3 attributes {interface = @io, ordinal = 0 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_3() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_size_x = hal.interface.workgroup.size[0] : index | |
| %workgroup_size_y = hal.interface.workgroup.size[1] : index | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y] | |
| %4 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x] | |
| %6 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min #map1(%arg0)[%workgroup_size_y] | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %9 = affine.min #map1(%arg1)[%workgroup_size_x] | |
| %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [10, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %11 = affine.min #map1(%arg0)[%workgroup_size_y] | |
| %12 = affine.min #map1(%arg1)[%workgroup_size_x] | |
| %13 = affine.min #map4(%arg0)[%workgroup_size_y] | |
| %14 = affine.min #map4(%arg1)[%workgroup_size_x] | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = affine.min #map4(%arg0)[%workgroup_size_y] | |
| %17 = affine.min #map4(%arg1)[%workgroup_size_x] | |
| %18 = linalg.init_tensor [%16, %17] : tensor<?x?xf32> | |
| %19 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<?x?xf32>) outs(%18 : tensor<?x?xf32>) { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %21 = linalg.index 0 : index | |
| %22 = affine.apply #map3(%21, %arg0) | |
| %23 = linalg.index 1 : index | |
| %24 = affine.apply #map3(%23, %arg1) | |
| %25 = cmpi eq, %22, %24 : index | |
| %26 = select %25, %cst, %cst_0 : f32 | |
| linalg.yield %26 : f32 | |
| } -> tensor<?x?xf32> | |
| %20 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%8, %10 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%19 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors | |
| %21 = mulf %arg2, %arg3 : f32 | |
| %22 = addf %21, %arg4 : f32 | |
| linalg.yield %22 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %20, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
| } | |
| } | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c10 = constant 10 : index | |
| %0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) = | |
| () -> (tensor<?x?xf32>, tensor<?x?xf32>) { | |
| %c10_0 = constant 10 : index | |
| %c1 = constant 1 : index | |
| %1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() {hal.bindings = [#hal.ex.result_buffer<"s0b0_xw_external", 0 : index>]} : () -> tensor<10x10xf32> | |
| %2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) {hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>]} : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| %4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) {hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>]} : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32> | |
| } | |
| check.expect_eq(%0#0, %0#1) : tensor<?x?xf32> | |
| return | |
| } | |
| } | |
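| // annotation (not part of the compiler output): the Canonicalizer dump below | |
| // appears limited to re-sorting the leading constants in each function (so the | |
| // 0.0/1.0 select reads "select %cond, %cst_0, %cst" instead of | |
| // "select %cond, %cst, %cst_0"); the loop nests, linalg ops and dispatches are | |
| // otherwise unchanged. | |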
| // -----// IR Dump After Canonicalizer //----- // | |
| #device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>]}> | |
| #executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> | |
| #map0 = affine_map<()[s0, s1] -> (s0 * s1)> | |
| #map1 = affine_map<(d0)[s0] -> (s0, -d0 + 10)> | |
| #map2 = affine_map<(d0, d1) -> (d0, d1)> | |
| #map3 = affine_map<(d0, d1) -> (d0 + d1)> | |
| #map4 = affine_map<(d0)[s0] -> (-d0 + 10, s0)> | |
| #map5 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
| #map6 = affine_map<(d0, d1, d2) -> (d2, d1)> | |
| #map7 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
| module attributes {hal.device.targets = [#device_target_cpu]} { | |
| hal.executable private @matmul_test_dispatch_0 { | |
| hal.interface public @io { | |
| hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard" | |
| } | |
| hal.executable.variant public @system_elf_x86_64, target = #executable_target_system_elf_x86_64_ { | |
| hal.executable.entry_point public @matmul_test_dispatch_0 attributes {interface = @io, ordinal = 0 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_0() { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_size_x = hal.interface.workgroup.size[0] : index | |
| %workgroup_size_y = hal.interface.workgroup.size[1] : index | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y] | |
| %2 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y] | |
| scf.for %arg0 = %1 to %c10 step %2 { | |
| %3 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x] | |
| %4 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x] | |
| scf.for %arg1 = %3 to %c10 step %4 { | |
| %5 = affine.min #map1(%arg0)[%workgroup_size_y] | |
| %6 = affine.min #map1(%arg1)[%workgroup_size_x] | |
| %7 = linalg.init_tensor [%5, %6] : tensor<?x?xf32> | |
| %8 = affine.min #map1(%arg0)[%workgroup_size_y] | |
| %9 = affine.min #map1(%arg1)[%workgroup_size_x] | |
| %10 = linalg.init_tensor [%8, %9] : tensor<?x?xf32> | |
| %11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<?x?xf32>) outs(%10 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %12 = linalg.index 0 : index | |
| %13 = affine.apply #map3(%12, %arg0) | |
| %14 = linalg.index 1 : index | |
| %15 = affine.apply #map3(%14, %arg1) | |
| %16 = cmpi eq, %13, %15 : index | |
| %17 = select %16, %cst_0, %cst : f32 | |
| linalg.yield %17 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %11, %0, offsets = [%arg0, %arg1], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
| } | |
| } | |
| hal.executable private @matmul_test_dispatch_2 { | |
| hal.interface public @io { | |
| hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
| } | |
| hal.executable.variant public @system_elf_x86_64, target = #executable_target_system_elf_x86_64_ { | |
| hal.executable.entry_point public @matmul_test_dispatch_2 attributes {interface = @io, ordinal = 0 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_2() { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_size_x = hal.interface.workgroup.size[0] : index | |
| %workgroup_size_y = hal.interface.workgroup.size[1] : index | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y] | |
| %4 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x] | |
| %6 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min #map1(%arg0)[%workgroup_size_y] | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %9 = affine.min #map1(%arg1)[%workgroup_size_x] | |
| %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [10, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %11 = affine.min #map1(%arg0)[%workgroup_size_y] | |
| %12 = affine.min #map1(%arg1)[%workgroup_size_x] | |
| %13 = affine.min #map4(%arg0)[%workgroup_size_y] | |
| %14 = affine.min #map4(%arg1)[%workgroup_size_x] | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = affine.min #map4(%arg0)[%workgroup_size_y] | |
| %17 = affine.min #map4(%arg1)[%workgroup_size_x] | |
| %18 = linalg.init_tensor [%16, %17] : tensor<?x?xf32> | |
| %19 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<?x?xf32>) outs(%18 : tensor<?x?xf32>) { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %21 = linalg.index 0 : index | |
| %22 = affine.apply #map3(%21, %arg0) | |
| %23 = linalg.index 1 : index | |
| %24 = affine.apply #map3(%23, %arg1) | |
| %25 = cmpi eq, %22, %24 : index | |
| %26 = select %25, %cst_0, %cst : f32 | |
| linalg.yield %26 : f32 | |
| } -> tensor<?x?xf32> | |
| %20 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%19 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %20, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
| } | |
| } | |
| hal.executable private @matmul_test_dispatch_3 { | |
| hal.interface public @io { | |
| hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
| } | |
| hal.executable.variant public @system_elf_x86_64, target = #executable_target_system_elf_x86_64_ { | |
| hal.executable.entry_point public @matmul_test_dispatch_3 attributes {interface = @io, ordinal = 0 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_3() { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_size_x = hal.interface.workgroup.size[0] : index | |
| %workgroup_size_y = hal.interface.workgroup.size[1] : index | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y] | |
| %4 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x] | |
| %6 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min #map1(%arg0)[%workgroup_size_y] | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %9 = affine.min #map1(%arg1)[%workgroup_size_x] | |
| %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [10, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %11 = affine.min #map1(%arg0)[%workgroup_size_y] | |
| %12 = affine.min #map1(%arg1)[%workgroup_size_x] | |
| %13 = affine.min #map4(%arg0)[%workgroup_size_y] | |
| %14 = affine.min #map4(%arg1)[%workgroup_size_x] | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = affine.min #map4(%arg0)[%workgroup_size_y] | |
| %17 = affine.min #map4(%arg1)[%workgroup_size_x] | |
| %18 = linalg.init_tensor [%16, %17] : tensor<?x?xf32> | |
| %19 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<?x?xf32>) outs(%18 : tensor<?x?xf32>) { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %21 = linalg.index 0 : index | |
| %22 = affine.apply #map3(%21, %arg0) | |
| %23 = linalg.index 1 : index | |
| %24 = affine.apply #map3(%23, %arg1) | |
| %25 = cmpi eq, %22, %24 : index | |
| %26 = select %25, %cst_0, %cst : f32 | |
| linalg.yield %26 : f32 | |
| } -> tensor<?x?xf32> | |
| %20 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%8, %10 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%19 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors | |
| %21 = mulf %arg2, %arg3 : f32 | |
| %22 = addf %21, %arg4 : f32 | |
| linalg.yield %22 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %20, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
| } | |
| } | |
| func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} { | |
| %c10 = constant 10 : index | |
| %0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) = | |
| () -> (tensor<?x?xf32>, tensor<?x?xf32>) { | |
| %c1 = constant 1 : index | |
| %c10_0 = constant 10 : index | |
| %1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() {hal.bindings = [#hal.ex.result_buffer<"s0b0_xw_external", 0 : index>]} : () -> tensor<10x10xf32> | |
| %2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) {hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>]} : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| %4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) {hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>]} : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0} | |
| flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32> | |
| } | |
| check.expect_eq(%0#0, %0#1) : tensor<?x?xf32> | |
| return | |
| } | |
| } | |
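| // annotation (not part of the compiler output): from here on each dump is printed | |
| // per hal.executable.variant rather than for the whole module, so the affine-map | |
| // aliases (#map0 ... #map7) show up inlined as affine_map<...> expressions. The | |
| // entry points carry no static workgroup size, so PropagateConstantWorkgroupInfoPass | |
| // appears to leave the hal.interface.workgroup.size/id/count ops in place. | |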
| // -----// IR Dump After mlir::iree_compiler::IREE::HAL::PropagateConstantWorkgroupInfoPass //----- // | |
| hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> { | |
| hal.executable.entry_point public @matmul_test_dispatch_0 attributes {interface = @io, ordinal = 0 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_0() { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_size_x = hal.interface.workgroup.size[0] : index | |
| %workgroup_size_y = hal.interface.workgroup.size[1] : index | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y] | |
| %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y] | |
| scf.for %arg0 = %1 to %c10 step %2 { | |
| %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x] | |
| %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x] | |
| scf.for %arg1 = %3 to %c10 step %4 { | |
| %5 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg0)[%workgroup_size_y] | |
| %6 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg1)[%workgroup_size_x] | |
| %7 = linalg.init_tensor [%5, %6] : tensor<?x?xf32> | |
| %8 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg0)[%workgroup_size_y] | |
| %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg1)[%workgroup_size_x] | |
| %10 = linalg.init_tensor [%8, %9] : tensor<?x?xf32> | |
| %11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<?x?xf32>) outs(%10 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %12 = linalg.index 0 : index | |
| %13 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%12, %arg0) | |
| %14 = linalg.index 1 : index | |
| %15 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%14, %arg1) | |
| %16 = cmpi eq, %13, %15 : index | |
| %17 = select %16, %cst_0, %cst : f32 | |
| linalg.yield %17 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %11, %0, offsets = [%arg0, %arg1], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
| } | |
| // -----// IR Dump After mlir::iree_compiler::IREE::HAL::PropagateConstantWorkgroupInfoPass //----- // | |
| hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> { | |
| hal.executable.entry_point public @matmul_test_dispatch_2 attributes {interface = @io, ordinal = 0 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_2() { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_size_x = hal.interface.workgroup.size[0] : index | |
| %workgroup_size_y = hal.interface.workgroup.size[1] : index | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y] | |
| %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x] | |
| %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg0)[%workgroup_size_y] | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg1)[%workgroup_size_x] | |
| %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [10, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg0)[%workgroup_size_y] | |
| %12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg1)[%workgroup_size_x] | |
| %13 = affine.min affine_map<(d0)[s0] -> (-d0 + 10, s0)>(%arg0)[%workgroup_size_y] | |
| %14 = affine.min affine_map<(d0)[s0] -> (-d0 + 10, s0)>(%arg1)[%workgroup_size_x] | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = affine.min affine_map<(d0)[s0] -> (-d0 + 10, s0)>(%arg0)[%workgroup_size_y] | |
| %17 = affine.min affine_map<(d0)[s0] -> (-d0 + 10, s0)>(%arg1)[%workgroup_size_x] | |
| %18 = linalg.init_tensor [%16, %17] : tensor<?x?xf32> | |
| %19 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<?x?xf32>) outs(%18 : tensor<?x?xf32>) { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %21 = linalg.index 0 : index | |
| %22 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%21, %arg0) | |
| %23 = linalg.index 1 : index | |
| %24 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%23, %arg1) | |
| %25 = cmpi eq, %22, %24 : index | |
| %26 = select %25, %cst_0, %cst : f32 | |
| linalg.yield %26 : f32 | |
| } -> tensor<?x?xf32> | |
| %20 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%19 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %20, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
| } | |
| // -----// IR Dump After mlir::iree_compiler::IREE::HAL::PropagateConstantWorkgroupInfoPass //----- // | |
| hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> { | |
| hal.executable.entry_point public @matmul_test_dispatch_3 attributes {interface = @io, ordinal = 0 : index} | |
| builtin.module { | |
| func @matmul_test_dispatch_3() { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_size_x = hal.interface.workgroup.size[0] : index | |
| %workgroup_size_y = hal.interface.workgroup.size[1] : index | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y] | |
| %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x] | |
| %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg0)[%workgroup_size_y] | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg1)[%workgroup_size_x] | |
| %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [10, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg0)[%workgroup_size_y] | |
| %12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg1)[%workgroup_size_x] | |
| %13 = affine.min affine_map<(d0)[s0] -> (-d0 + 10, s0)>(%arg0)[%workgroup_size_y] | |
| %14 = affine.min affine_map<(d0)[s0] -> (-d0 + 10, s0)>(%arg1)[%workgroup_size_x] | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = affine.min affine_map<(d0)[s0] -> (-d0 + 10, s0)>(%arg0)[%workgroup_size_y] | |
| %17 = affine.min affine_map<(d0)[s0] -> (-d0 + 10, s0)>(%arg1)[%workgroup_size_x] | |
| %18 = linalg.init_tensor [%16, %17] : tensor<?x?xf32> | |
| %19 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<?x?xf32>) outs(%18 : tensor<?x?xf32>) { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %21 = linalg.index 0 : index | |
| %22 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%21, %arg0) | |
| %23 = linalg.index 1 : index | |
| %24 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%23, %arg1) | |
| %25 = cmpi eq, %22, %24 : index | |
| %26 = select %25, %cst_0, %cst : f32 | |
| linalg.yield %26 : f32 | |
| } -> tensor<?x?xf32> | |
| %20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%8, %10 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%19 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors | |
| %21 = mulf %arg2, %arg3 : f32 | |
| %22 = addf %21, %arg4 : f32 | |
| linalg.yield %22 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %20, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
| } | |
| // -----// IR Dump After LLVMCPULowerExecutableTarget Failed //----- // | |
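| // NOTE: the dump below appears in MLIR's generic (fully quoted) op form, the printer's fallback once a pass has signalled failure and the IR can no longer be assumed to verify; it is otherwise the same tiled-and-distributed dispatch shown above. | |
| // NOTE: in this dispatch the matmul is still a plain linalg.generic (presumably the @expected reference path) carrying only a first-level lowering.config of tileSizes = [[64, 64]], whereas @matmul_test_dispatch_2 further below has a named linalg.matmul with the full tileSizes/nativeVectorSize config; that gap may be what the CPUVectorization pipeline fails on. | |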
| "hal.executable.variant"() ( { | |
| "hal.executable.entry_point"() {interface = @io, ordinal = 0 : index, sym_name = "matmul_test_dispatch_3", translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} : () -> () | |
| "builtin.module"() ( { | |
| "builtin.func"() ( { | |
| %0 = "std.constant"() {value = 1.000000e+00 : f32} : () -> f32 | |
| %1 = "std.constant"() {value = 0.000000e+00 : f32} : () -> f32 | |
| %2 = "std.constant"() {value = 10 : index} : () -> index | |
| %3 = "std.constant"() {value = 0 : index} : () -> index | |
| %4 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b0_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32> | |
| %5 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b1_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32> | |
| %6 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b2_xw_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %7 = "hal.interface.workgroup.size"() {dimension = 0 : index} : () -> index | |
| %8 = "hal.interface.workgroup.size"() {dimension = 1 : index} : () -> index | |
| %9 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
| %10 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index | |
| %11 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
| %12 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index | |
| %13 = "affine.apply"(%11, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| %14 = "affine.apply"(%12, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| "scf.for"(%13, %2, %14) ( { | |
| ^bb0(%arg0: index): // no predecessors | |
| %15 = "affine.apply"(%9, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| %16 = "affine.apply"(%10, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| "scf.for"(%15, %2, %16) ( { | |
| ^bb0(%arg1: index): // no predecessors | |
| %17 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %18 = "flow.dispatch.tensor.load"(%4, %arg0, %17) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [-9223372036854775808, 0], static_sizes = [-1, 10], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<?x10xf32> | |
| %19 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %20 = "flow.dispatch.tensor.load"(%5, %arg1, %19) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [0, -9223372036854775808], static_sizes = [10, -1], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<10x?xf32> | |
| %21 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %22 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %23 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %24 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %25 = "linalg.init_tensor"(%23, %24) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32> | |
| %26 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %27 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %28 = "linalg.init_tensor"(%26, %27) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32> | |
| %29 = "linalg.generic"(%25, %28) ( { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %31 = "linalg.index"() {dim = 0 : i64} : () -> index | |
| %32 = "affine.apply"(%31, %arg0) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index | |
| %33 = "linalg.index"() {dim = 1 : i64} : () -> index | |
| %34 = "affine.apply"(%33, %arg1) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index | |
| %35 = "std.cmpi"(%32, %34) {predicate = 0 : i64} : (index, index) -> i1 | |
| %36 = "std.select"(%35, %1, %0) : (i1, f32, f32) -> f32 | |
| "linalg.yield"(%36) : (f32) -> () | |
| }) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32> | |
| %30 = "linalg.generic"(%18, %20, %29) ( { | |
| ^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors | |
| %31 = "std.mulf"(%arg2, %arg3) : (f32, f32) -> f32 | |
| %32 = "std.addf"(%31, %arg4) : (f32, f32) -> f32 | |
| "linalg.yield"(%32) : (f32) -> () | |
| }) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x10xf32>, tensor<10x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32> | |
| "flow.dispatch.tensor.store"(%30, %6, %arg0, %arg1, %21, %22) {operand_segment_sizes = dense<[1, 1, 2, 2, 0]> : vector<5xi32>, static_offsets = [-9223372036854775808, -9223372036854775808], static_sizes = [-1, -1], static_strides = [1, 1]} : (tensor<?x?xf32>, !flow.dispatch.tensor<writeonly:10x10xf32>, index, index, index, index) -> () | |
| "scf.yield"() : () -> () | |
| }) : (index, index, index) -> () | |
| "scf.yield"() : () -> () | |
| }) : (index, index, index) -> () | |
| "std.return"() : () -> () | |
| }) {sym_name = "matmul_test_dispatch_3", type = () -> ()} : () -> () | |
| "hal.interface"() ( { | |
| "hal.interface.binding"() {access = 1 : i32, binding = 0 : index, set = 0 : index, sym_name = "s0b0_ro_external", type = 7 : i32} : () -> () | |
| "hal.interface.binding"() {access = 1 : i32, binding = 1 : index, set = 0 : index, sym_name = "s0b1_ro_external", type = 7 : i32} : () -> () | |
| "hal.interface.binding"() {access = 6 : i32, binding = 2 : index, set = 0 : index, sym_name = "s0b2_xw_external", type = 7 : i32} : () -> () | |
| "hal.interface_end"() : () -> () | |
| }) {sym_name = "io", sym_visibility = "private"} : () -> () | |
| }) : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "system_elf_x86_64", target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>} : () -> () | |
| // -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateTargetExecutableVariantsPass Failed //----- // | |
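| // NOTE: the module below is a verbatim repeat of the previous dump; the failure simply propagates up the nested pass managers, so the enclosing TranslateTargetExecutableVariantsPass re-reports it on unchanged IR. | |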
| "hal.executable.variant"() ( { | |
| "hal.executable.entry_point"() {interface = @io, ordinal = 0 : index, sym_name = "matmul_test_dispatch_3", translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} : () -> () | |
| "builtin.module"() ( { | |
| "builtin.func"() ( { | |
| %0 = "std.constant"() {value = 1.000000e+00 : f32} : () -> f32 | |
| %1 = "std.constant"() {value = 0.000000e+00 : f32} : () -> f32 | |
| %2 = "std.constant"() {value = 10 : index} : () -> index | |
| %3 = "std.constant"() {value = 0 : index} : () -> index | |
| %4 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b0_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32> | |
| %5 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b1_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32> | |
| %6 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b2_xw_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %7 = "hal.interface.workgroup.size"() {dimension = 0 : index} : () -> index | |
| %8 = "hal.interface.workgroup.size"() {dimension = 1 : index} : () -> index | |
| %9 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
| %10 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index | |
| %11 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
| %12 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index | |
| %13 = "affine.apply"(%11, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| %14 = "affine.apply"(%12, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| "scf.for"(%13, %2, %14) ( { | |
| ^bb0(%arg0: index): // no predecessors | |
| %15 = "affine.apply"(%9, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| %16 = "affine.apply"(%10, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| "scf.for"(%15, %2, %16) ( { | |
| ^bb0(%arg1: index): // no predecessors | |
| %17 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %18 = "flow.dispatch.tensor.load"(%4, %arg0, %17) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [-9223372036854775808, 0], static_sizes = [-1, 10], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<?x10xf32> | |
| %19 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %20 = "flow.dispatch.tensor.load"(%5, %arg1, %19) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [0, -9223372036854775808], static_sizes = [10, -1], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<10x?xf32> | |
| %21 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %22 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %23 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %24 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %25 = "linalg.init_tensor"(%23, %24) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32> | |
| %26 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %27 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %28 = "linalg.init_tensor"(%26, %27) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32> | |
| %29 = "linalg.generic"(%25, %28) ( { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %31 = "linalg.index"() {dim = 0 : i64} : () -> index | |
| %32 = "affine.apply"(%31, %arg0) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index | |
| %33 = "linalg.index"() {dim = 1 : i64} : () -> index | |
| %34 = "affine.apply"(%33, %arg1) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index | |
| %35 = "std.cmpi"(%32, %34) {predicate = 0 : i64} : (index, index) -> i1 | |
| %36 = "std.select"(%35, %1, %0) : (i1, f32, f32) -> f32 | |
| "linalg.yield"(%36) : (f32) -> () | |
| }) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32> | |
| %30 = "linalg.generic"(%18, %20, %29) ( { | |
| ^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors | |
| %31 = "std.mulf"(%arg2, %arg3) : (f32, f32) -> f32 | |
| %32 = "std.addf"(%31, %arg4) : (f32, f32) -> f32 | |
| "linalg.yield"(%32) : (f32) -> () | |
| }) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x10xf32>, tensor<10x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32> | |
| "flow.dispatch.tensor.store"(%30, %6, %arg0, %arg1, %21, %22) {operand_segment_sizes = dense<[1, 1, 2, 2, 0]> : vector<5xi32>, static_offsets = [-9223372036854775808, -9223372036854775808], static_sizes = [-1, -1], static_strides = [1, 1]} : (tensor<?x?xf32>, !flow.dispatch.tensor<writeonly:10x10xf32>, index, index, index, index) -> () | |
| "scf.yield"() : () -> () | |
| }) : (index, index, index) -> () | |
| "scf.yield"() : () -> () | |
| }) : (index, index, index) -> () | |
| "std.return"() : () -> () | |
| }) {sym_name = "matmul_test_dispatch_3", type = () -> ()} : () -> () | |
| "hal.interface"() ( { | |
| "hal.interface.binding"() {access = 1 : i32, binding = 0 : index, set = 0 : index, sym_name = "s0b0_ro_external", type = 7 : i32} : () -> () | |
| "hal.interface.binding"() {access = 1 : i32, binding = 1 : index, set = 0 : index, sym_name = "s0b1_ro_external", type = 7 : i32} : () -> () | |
| "hal.interface.binding"() {access = 6 : i32, binding = 2 : index, set = 0 : index, sym_name = "s0b2_xw_external", type = 7 : i32} : () -> () | |
| "hal.interface_end"() : () -> () | |
| }) {sym_name = "io", sym_visibility = "private"} : () -> () | |
| }) : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "system_elf_x86_64", target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>} : () -> () | |
| // -----// IR Dump After SetNumWorkgroups //----- // | |
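| // NOTE: compilation continues for the remaining dispatches. Relative to the earlier dumps, SetNumWorkgroups attaches a workgroup-count region to each hal.executable.entry_point (the ^bb0 below, ceildiv 64 in x and y, matching workloadPerWorkgroup = [64, 64]) and materializes the tile sizes as the constant %c64 instead of hal.interface.workgroup.size queries. | |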
| hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> { | |
| hal.executable.entry_point public @matmul_test_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} { | |
| ^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors | |
| %c1 = constant 1 : index | |
| %0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg0] | |
| %1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1] | |
| hal.return %0, %1, %c1 : index, index, index | |
| } | |
| builtin.module { | |
| func @matmul_test_dispatch_0() { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %c64 = constant 64 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %c64] | |
| %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %c64] | |
| scf.for %arg0 = %1 to %c10 step %2 { | |
| %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %c64] | |
| %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %c64] | |
| scf.for %arg1 = %3 to %c10 step %4 { | |
| %5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %7 = linalg.init_tensor [%5, %6] : tensor<?x?xf32> | |
| %8 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %10 = linalg.init_tensor [%8, %9] : tensor<?x?xf32> | |
| %11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<?x?xf32>) outs(%10 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %12 = linalg.index 0 : index | |
| %13 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%12, %arg0) | |
| %14 = linalg.index 1 : index | |
| %15 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%14, %arg1) | |
| %16 = cmpi eq, %13, %15 : index | |
| %17 = select %16, %cst_0, %cst : f32 | |
| linalg.yield %17 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %11, %0, offsets = [%arg0, %arg1], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
| } | |
| // -----// IR Dump After SetNumWorkgroups //----- // | |
| hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> { | |
| hal.executable.entry_point public @matmul_test_dispatch_2 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} { | |
| ^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors | |
| %c1 = constant 1 : index | |
| %0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg0] | |
| %1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1] | |
| hal.return %0, %1, %c1 : index, index, index | |
| } | |
| builtin.module { | |
| func @matmul_test_dispatch_2() { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %c64 = constant 64 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %c64] | |
| %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %c64] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %c64] | |
| %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %c64] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [10, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %11 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %12 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %13 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %14 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %17 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %18 = linalg.init_tensor [%16, %17] : tensor<?x?xf32> | |
| %19 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<?x?xf32>) outs(%18 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %21 = linalg.index 0 : index | |
| %22 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%21, %arg0) | |
| %23 = linalg.index 1 : index | |
| %24 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%23, %arg1) | |
| %25 = cmpi eq, %22, %24 : index | |
| %26 = select %25, %cst_0, %cst : f32 | |
| linalg.yield %26 : f32 | |
| } -> tensor<?x?xf32> | |
| %20 = linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%8, %10 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%19 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %20, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
| } | |
| // -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass Failed //----- // | |
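| // NOTE: the per-variant failure above now surfaces at the enclosing hal.executable for matmul_test_dispatch_3, which is printed once more in generic form; the other executables are unaffected. | |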
| "hal.executable"() ( { | |
| "hal.interface"() ( { | |
| "hal.interface.binding"() {access = 1 : i32, binding = 0 : index, set = 0 : index, sym_name = "s0b0_ro_external", type = 7 : i32} : () -> () | |
| "hal.interface.binding"() {access = 1 : i32, binding = 1 : index, set = 0 : index, sym_name = "s0b1_ro_external", type = 7 : i32} : () -> () | |
| "hal.interface.binding"() {access = 6 : i32, binding = 2 : index, set = 0 : index, sym_name = "s0b2_xw_external", type = 7 : i32} : () -> () | |
| "hal.interface_end"() : () -> () | |
| }) {sym_name = "io"} : () -> () | |
| "hal.executable.variant"() ( { | |
| "hal.executable.entry_point"() {interface = @io, ordinal = 0 : index, sym_name = "matmul_test_dispatch_3", translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} : () -> () | |
| "builtin.module"() ( { | |
| "builtin.func"() ( { | |
| %0 = "std.constant"() {value = 1.000000e+00 : f32} : () -> f32 | |
| %1 = "std.constant"() {value = 0.000000e+00 : f32} : () -> f32 | |
| %2 = "std.constant"() {value = 10 : index} : () -> index | |
| %3 = "std.constant"() {value = 0 : index} : () -> index | |
| %4 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b0_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32> | |
| %5 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b1_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32> | |
| %6 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b2_xw_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %7 = "hal.interface.workgroup.size"() {dimension = 0 : index} : () -> index | |
| %8 = "hal.interface.workgroup.size"() {dimension = 1 : index} : () -> index | |
| %9 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
| %10 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index | |
| %11 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
| %12 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index | |
| %13 = "affine.apply"(%11, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| %14 = "affine.apply"(%12, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| "scf.for"(%13, %2, %14) ( { | |
| ^bb0(%arg0: index): // no predecessors | |
| %15 = "affine.apply"(%9, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| %16 = "affine.apply"(%10, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| "scf.for"(%15, %2, %16) ( { | |
| ^bb0(%arg1: index): // no predecessors | |
| %17 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %18 = "flow.dispatch.tensor.load"(%4, %arg0, %17) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [-9223372036854775808, 0], static_sizes = [-1, 10], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<?x10xf32> | |
| %19 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %20 = "flow.dispatch.tensor.load"(%5, %arg1, %19) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [0, -9223372036854775808], static_sizes = [10, -1], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<10x?xf32> | |
| %21 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %22 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %23 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %24 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %25 = "linalg.init_tensor"(%23, %24) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32> | |
| %26 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %27 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %28 = "linalg.init_tensor"(%26, %27) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32> | |
| %29 = "linalg.generic"(%25, %28) ( { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %31 = "linalg.index"() {dim = 0 : i64} : () -> index | |
| %32 = "affine.apply"(%31, %arg0) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index | |
| %33 = "linalg.index"() {dim = 1 : i64} : () -> index | |
| %34 = "affine.apply"(%33, %arg1) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index | |
| %35 = "std.cmpi"(%32, %34) {predicate = 0 : i64} : (index, index) -> i1 | |
| %36 = "std.select"(%35, %1, %0) : (i1, f32, f32) -> f32 | |
| "linalg.yield"(%36) : (f32) -> () | |
| }) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32> | |
| %30 = "linalg.generic"(%18, %20, %29) ( { | |
| ^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors | |
| %31 = "std.mulf"(%arg2, %arg3) : (f32, f32) -> f32 | |
| %32 = "std.addf"(%31, %arg4) : (f32, f32) -> f32 | |
| "linalg.yield"(%32) : (f32) -> () | |
| }) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x10xf32>, tensor<10x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32> | |
| "flow.dispatch.tensor.store"(%30, %6, %arg0, %arg1, %21, %22) {operand_segment_sizes = dense<[1, 1, 2, 2, 0]> : vector<5xi32>, static_offsets = [-9223372036854775808, -9223372036854775808], static_sizes = [-1, -1], static_strides = [1, 1]} : (tensor<?x?xf32>, !flow.dispatch.tensor<writeonly:10x10xf32>, index, index, index, index) -> () | |
| "scf.yield"() : () -> () | |
| }) : (index, index, index) -> () | |
| "scf.yield"() : () -> () | |
| }) : (index, index, index) -> () | |
| "std.return"() : () -> () | |
| }) {sym_name = "matmul_test_dispatch_3", type = () -> ()} : () -> () | |
| "hal.interface"() ( { | |
| "hal.interface.binding"() {access = 1 : i32, binding = 0 : index, set = 0 : index, sym_name = "s0b0_ro_external", type = 7 : i32} : () -> () | |
| "hal.interface.binding"() {access = 1 : i32, binding = 1 : index, set = 0 : index, sym_name = "s0b1_ro_external", type = 7 : i32} : () -> () | |
| "hal.interface.binding"() {access = 6 : i32, binding = 2 : index, set = 0 : index, sym_name = "s0b2_xw_external", type = 7 : i32} : () -> () | |
| "hal.interface_end"() : () -> () | |
| }) {sym_name = "io", sym_visibility = "private"} : () -> () | |
| }) : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "system_elf_x86_64", target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>} : () -> () | |
| "hal.executable_end"() : () -> () | |
| }) {sym_name = "matmul_test_dispatch_3", sym_visibility = "private"} : () -> () | |
| // -----// IR Dump After Canonicalizer //----- // | |
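| // NOTE: canonicalization folds the %c64 operand into the affine maps (now affine_map<()[s0] -> (s0 * 64)>) and reorders the constants; the loop nest itself is unchanged. | |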
| hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> { | |
| hal.executable.entry_point public @matmul_test_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} { | |
| ^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors | |
| %c1 = constant 1 : index | |
| %0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg0] | |
| %1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1] | |
| hal.return %0, %1, %c1 : index, index, index | |
| } | |
| builtin.module { | |
| func @matmul_test_dispatch_0() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %1 to %c10 step %2 { | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %3 to %c10 step %4 { | |
| %5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %7 = linalg.init_tensor [%5, %6] : tensor<?x?xf32> | |
| %8 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %10 = linalg.init_tensor [%8, %9] : tensor<?x?xf32> | |
| %11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<?x?xf32>) outs(%10 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %12 = linalg.index 0 : index | |
| %13 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%12, %arg0) | |
| %14 = linalg.index 1 : index | |
| %15 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%14, %arg1) | |
| %16 = cmpi eq, %13, %15 : index | |
| %17 = select %16, %cst, %cst_0 : f32 | |
| linalg.yield %17 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %11, %0, offsets = [%arg0, %arg1], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
| hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> { | |
| hal.executable.entry_point public @matmul_test_dispatch_2 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} { | |
| ^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors | |
| %c1 = constant 1 : index | |
| %0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg0] | |
| %1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1] | |
| hal.return %0, %1, %c1 : index, index, index | |
| } | |
| builtin.module { | |
| func @matmul_test_dispatch_2() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [10, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %11 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %12 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %13 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %14 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %17 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %18 = linalg.init_tensor [%16, %17] : tensor<?x?xf32> | |
| %19 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<?x?xf32>) outs(%18 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %21 = linalg.index 0 : index | |
| %22 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%21, %arg0) | |
| %23 = linalg.index 1 : index | |
| %24 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%23, %arg1) | |
| %25 = cmpi eq, %22, %24 : index | |
| %26 = select %25, %cst, %cst_0 : f32 | |
| linalg.yield %26 : f32 | |
| } -> tensor<?x?xf32> | |
| %20 = linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%8, %10 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%19 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %20, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
| module { | |
| func @matmul_test_dispatch_0() { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %1 to %c10 step %2 { | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %3 to %c10 step %4 { | |
| %5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %7 = linalg.init_tensor [%5, %6] : tensor<?x?xf32> | |
| %8 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %10 = linalg.init_tensor [%8, %9] : tensor<?x?xf32> | |
| %11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<?x?xf32>) outs(%10 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %12 = linalg.index 0 : index | |
| %13 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%12, %arg0) | |
| %14 = linalg.index 1 : index | |
| %15 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%14, %arg1) | |
| %16 = cmpi eq, %13, %15 : index | |
| %17 = select %16, %cst_0, %cst : f32 | |
| linalg.yield %17 : f32 | |
| } -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %11, %0, offsets = [%arg0, %arg1], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
| module { | |
| func @matmul_test_dispatch_2() { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [10, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %11 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %12 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %13 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %14 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32> | |
| %16 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %17 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %18 = linalg.init_tensor [%16, %17] : tensor<?x?xf32> | |
| %19 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<?x?xf32>) outs(%18 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %21 = linalg.index 0 : index | |
| %22 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%21, %arg0) | |
| %23 = linalg.index 1 : index | |
| %24 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%23, %arg1) | |
| %25 = cmpi eq, %22, %24 : index | |
| %26 = select %25, %cst_0, %cst : f32 | |
| linalg.yield %26 : f32 | |
| } -> tensor<?x?xf32> | |
| %20 = linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%8, %10 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%19 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| flow.dispatch.tensor.store %20, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| } | |
| } | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
| // -----// IR Dump After LinalgBufferize //----- // | |
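| // NOTE: LinalgBufferize runs on the inner module of each variant. Every hal.interface.binding.subspan is re-materialized as a memref<10x10xf32> next to its !flow.dispatch.tensor form, tiles become memref.subview / memref.alloca, and the linalg ops are cloned onto buffers; the original tensor-based ops are left dead here for later passes to erase. | |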
| func @matmul_test_dispatch_0() { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %2 to %c10 step %3 { | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %4 to %c10 step %5 { | |
| %6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %8 = linalg.init_tensor [%6, %7] : tensor<?x?xf32> | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %10 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %11 = linalg.init_tensor [%9, %10] : tensor<?x?xf32> | |
| %12 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %13 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %14 = memref.subview %0[%arg0, %arg1] [%12, %13] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %15 = memref.alloca(%6, %7) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%15 : memref<?x?xf32>) outs(%14 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %17 = linalg.index 0 : index | |
| %18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg0) | |
| %19 = linalg.index 1 : index | |
| %20 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%19, %arg1) | |
| %21 = cmpi eq, %18, %20 : index | |
| %22 = select %21, %cst_0, %cst : f32 | |
| linalg.yield %22 : f32 | |
| } | |
| %16 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%11 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %17 = linalg.index 0 : index | |
| %18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg0) | |
| %19 = linalg.index 1 : index | |
| %20 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%19, %arg1) | |
| %21 = cmpi eq, %18, %20 : index | |
| %22 = select %21, %cst_0, %cst : f32 | |
| linalg.yield %22 : f32 | |
| } -> tensor<?x?xf32> | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After LinalgBufferize //----- // | |
| func @matmul_test_dispatch_2() { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %6 to %c10 step %7 { | |
| %8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %9 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %8 to %c10 step %9 { | |
| %10 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %11 = memref.subview %0[%arg0, 0] [%10, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %12 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0], sizes = [%10, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32> | |
| %13 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %14 = memref.subview %2[0, %arg1] [10, %13] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %15 = flow.dispatch.tensor.load %3, offsets = [0, %arg1], sizes = [10, %13], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32> | |
| %16 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %17 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %18 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %19 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %20 = linalg.init_tensor [%18, %19] : tensor<?x?xf32> | |
| %21 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %22 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %23 = linalg.init_tensor [%21, %22] : tensor<?x?xf32> | |
| %24 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %25 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %26 = memref.subview %4[%arg0, %arg1] [%24, %25] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %27 = memref.alloca(%18, %19) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%27 : memref<?x?xf32>) outs(%26 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %30 = linalg.index 0 : index | |
| %31 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%30, %arg0) | |
| %32 = linalg.index 1 : index | |
| %33 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%32, %arg1) | |
| %34 = cmpi eq, %31, %33 : index | |
| %35 = select %34, %cst_0, %cst : f32 | |
| linalg.yield %35 : f32 | |
| } | |
| %28 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%20 : tensor<?x?xf32>) outs(%23 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %30 = linalg.index 0 : index | |
| %31 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%30, %arg0) | |
| %32 = linalg.index 1 : index | |
| %33 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%32, %arg1) | |
| %34 = cmpi eq, %31, %33 : index | |
| %35 = select %34, %cst_0, %cst : f32 | |
| linalg.yield %35 : f32 | |
| } -> tensor<?x?xf32> | |
| linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%11, %14 : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%26 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) | |
| %29 = linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%12, %15 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%28 : tensor<?x?xf32>) -> tensor<?x?xf32> | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After ResolveShapedTypeResultDims //----- // | |
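| // NOTE: by this point the dead tensor-side ops left over from bufferization (linalg.init_tensor, the tensor linalg ops and flow.dispatch.tensor.load/store) have been dropped, leaving only the buffer form of each dispatch plus the still-unused tensor subspans. | |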
| module { | |
| func @matmul_test_dispatch_0() { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %2 to %c10 step %3 { | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %4 to %c10 step %5 { | |
| %6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %8 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %10 = memref.subview %0[%arg0, %arg1] [%8, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %11 = memref.alloca(%6, %7) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : memref<?x?xf32>) outs(%10 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %12 = linalg.index 0 : index | |
| %13 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%12, %arg0) | |
| %14 = linalg.index 1 : index | |
| %15 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%14, %arg1) | |
| %16 = cmpi eq, %13, %15 : index | |
| %17 = select %16, %cst_0, %cst : f32 | |
| linalg.yield %17 : f32 | |
| } | |
| } | |
| } | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
| // -----// IR Dump After ResolveShapedTypeResultDims //----- // | |
| module { | |
| func @matmul_test_dispatch_2() { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %6 to %c10 step %7 { | |
| %8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %9 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %8 to %c10 step %9 { | |
| %10 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %11 = memref.subview %0[%arg0, 0] [%10, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %12 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %13 = memref.subview %2[0, %arg1] [10, %12] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %14 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %15 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %16 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %17 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %18 = memref.subview %4[%arg0, %arg1] [%16, %17] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %19 = memref.alloca(%14, %15) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%19 : memref<?x?xf32>) outs(%18 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %20 = linalg.index 0 : index | |
| %21 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%20, %arg0) | |
| %22 = linalg.index 1 : index | |
| %23 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%22, %arg1) | |
| %24 = cmpi eq, %21, %23 : index | |
| %25 = select %24, %cst_0, %cst : f32 | |
| linalg.yield %25 : f32 | |
| } | |
| linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%11, %13 : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%18 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) | |
| } | |
| } | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
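| // Note: the two dumps above show the dispatches after distribution to workgroups. Each function | |
| // iterates over 64x64 tiles via scf.for loops built from hal.interface.workgroup.id/count, and | |
| // affine.min ops clamp every tile to the static 10x10 problem size. At this stage each binding | |
| // still carries both a memref subspan and an unused !flow.dispatch.tensor subspan; the duplicate | |
| // affine.min ops and dead subspans are removed by the Canonicalizer/CSE/CleanupBufferAllocView | |
| // dumps that follow. | |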
| // -----// IR Dump After Canonicalizer //----- // | |
| func @matmul_test_dispatch_0() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %2 to %c10 step %3 { | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %4 to %c10 step %5 { | |
| %6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %8 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %10 = memref.subview %0[%arg0, %arg1] [%8, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %11 = memref.alloca(%6, %7) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : memref<?x?xf32>) outs(%10 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %12 = linalg.index 0 : index | |
| %13 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%12, %arg0) | |
| %14 = linalg.index 1 : index | |
| %15 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%14, %arg1) | |
| %16 = cmpi eq, %13, %15 : index | |
| %17 = select %16, %cst, %cst_0 : f32 | |
| linalg.yield %17 : f32 | |
| } | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
| func @matmul_test_dispatch_2() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %6 to %c10 step %7 { | |
| %8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %9 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %8 to %c10 step %9 { | |
| %10 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %11 = memref.subview %0[%arg0, 0] [%10, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %12 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %13 = memref.subview %2[0, %arg1] [10, %12] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %14 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %15 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %16 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %17 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %18 = memref.subview %4[%arg0, %arg1] [%16, %17] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %19 = memref.alloca(%14, %15) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%19 : memref<?x?xf32>) outs(%18 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %20 = linalg.index 0 : index | |
| %21 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%20, %arg0) | |
| %22 = linalg.index 1 : index | |
| %23 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%22, %arg1) | |
| %24 = cmpi eq, %21, %23 : index | |
| %25 = select %24, %cst, %cst_0 : f32 | |
| linalg.yield %25 : f32 | |
| } | |
| linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%11, %13 : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%18 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After CSE //----- // | |
| func @matmul_test_dispatch_0() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %2 to %c10 step %3 { | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %4 to %c10 step %5 { | |
| %6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %8 = memref.subview %0[%arg0, %arg1] [%6, %7] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %9 = memref.alloca(%6, %7) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : memref<?x?xf32>) outs(%8 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %10 = linalg.index 0 : index | |
| %11 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%10, %arg0) | |
| %12 = linalg.index 1 : index | |
| %13 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%12, %arg1) | |
| %14 = cmpi eq, %11, %13 : index | |
| %15 = select %14, %cst, %cst_0 : f32 | |
| linalg.yield %15 : f32 | |
| } | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After CSE //----- // | |
| func @matmul_test_dispatch_2() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32> | |
| %4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %6 to %c10 step %7 { | |
| %8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %9 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %8 to %c10 step %9 { | |
| %10 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %11 = memref.subview %0[%arg0, 0] [%10, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %12 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %13 = memref.subview %2[0, %arg1] [10, %12] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %14 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %15 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %16 = memref.subview %4[%arg0, %arg1] [%10, %12] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %17 = memref.alloca(%14, %15) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%17 : memref<?x?xf32>) outs(%16 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %18 = linalg.index 0 : index | |
| %19 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%18, %arg0) | |
| %20 = linalg.index 1 : index | |
| %21 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%20, %arg1) | |
| %22 = cmpi eq, %19, %21 : index | |
| %23 = select %22, %cst, %cst_0 : f32 | |
| linalg.yield %23 : f32 | |
| } | |
| linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%11, %13 : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%16 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) | |
| } | |
| } | |
| return | |
| } | |
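| // Note: in the CSE dump above, the output subview of dispatch_2 now reuses the %10 and %12 tile | |
| // extents computed for the input subviews. The (-d0 + 10, 64) affine.min variants feeding the | |
| // alloca survive because their result expressions are listed in the opposite order, which CSE | |
| // treats as a distinct map even though it yields the same value. | |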
| // -----// IR Dump After CleanupBufferAllocView //----- // | |
| func @matmul_test_dispatch_0() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %1 to %c10 step %2 { | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %3 to %c10 step %4 { | |
| %5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %8 = memref.alloca(%5, %6) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : memref<?x?xf32>) outs(%7 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %9 = linalg.index 0 : index | |
| %10 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%9, %arg0) | |
| %11 = linalg.index 1 : index | |
| %12 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%11, %arg1) | |
| %13 = cmpi eq, %10, %12 : index | |
| %14 = select %13, %cst, %cst_0 : f32 | |
| linalg.yield %14 : f32 | |
| } | |
| } | |
| } | |
| return | |
| } | |
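| // Note: CleanupBufferAllocView erased the unused !flow.dispatch.tensor subspan, leaving only the | |
| // memref binding for dispatch_0; the body is otherwise unchanged, as is the CSE dump that follows. | |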
| // -----// IR Dump After CSE //----- // | |
| func @matmul_test_dispatch_0() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %1 to %c10 step %2 { | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %3 to %c10 step %4 { | |
| %5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %8 = memref.alloca(%5, %6) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : memref<?x?xf32>) outs(%7 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %9 = linalg.index 0 : index | |
| %10 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%9, %arg0) | |
| %11 = linalg.index 1 : index | |
| %12 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%11, %arg1) | |
| %13 = cmpi eq, %10, %12 : index | |
| %14 = select %13, %cst, %cst_0 : f32 | |
| linalg.yield %14 : f32 | |
| } | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After CleanupBufferAllocView //----- // | |
| func @matmul_test_dispatch_2() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %14 = memref.alloca(%11, %12) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14 : memref<?x?xf32>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %15 = linalg.index 0 : index | |
| %16 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%15, %arg0) | |
| %17 = linalg.index 1 : index | |
| %18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1) | |
| %19 = cmpi eq, %16, %18 : index | |
| %20 = select %19, %cst, %cst_0 : f32 | |
| linalg.yield %20 : f32 | |
| } | |
| linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%8, %10 : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After CSE //----- // | |
| func @matmul_test_dispatch_2() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %14 = memref.alloca(%11, %12) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14 : memref<?x?xf32>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %15 = linalg.index 0 : index | |
| %16 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%15, %arg0) | |
| %17 = linalg.index 1 : index | |
| %18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1) | |
| %19 = cmpi eq, %16, %18 : index | |
| %20 = select %19, %cst, %cst_0 : f32 | |
| linalg.yield %20 : f32 | |
| } | |
| linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%8, %10 : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
| func @matmul_test_dispatch_0() { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %1 to %c10 step %2 { | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %3 to %c10 step %4 { | |
| %5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %8 = memref.alloca(%5, %6) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : memref<?x?xf32>) outs(%7 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %9 = linalg.index 0 : index | |
| %10 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%9, %arg0) | |
| %11 = linalg.index 1 : index | |
| %12 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%11, %arg1) | |
| %13 = cmpi eq, %10, %12 : index | |
| %14 = select %13, %cst_0, %cst : f32 | |
| linalg.yield %14 : f32 | |
| } | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
| func @matmul_test_dispatch_2() { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %14 = memref.alloca(%11, %12) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14 : memref<?x?xf32>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %15 = linalg.index 0 : index | |
| %16 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%15, %arg0) | |
| %17 = linalg.index 1 : index | |
| %18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1) | |
| %19 = cmpi eq, %16, %18 : index | |
| %20 = select %19, %cst_0, %cst : f32 | |
| linalg.yield %20 : f32 | |
| } | |
| linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%8, %10 : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After LLVMCPUVectorization //----- // | |
| func @matmul_test_dispatch_0() { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %1 to %c10 step %2 { | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %3 to %c10 step %4 { | |
| %5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %8 = memref.alloca(%5, %6) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : memref<?x?xf32>) outs(%7 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %9 = linalg.index 0 : index | |
| %10 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%9, %arg0) | |
| %11 = linalg.index 1 : index | |
| %12 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%11, %arg1) | |
| %13 = cmpi eq, %10, %12 : index | |
| %14 = select %13, %cst_0, %cst : f32 | |
| linalg.yield %14 : f32 | |
| } | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After CSE //----- // | |
| func @matmul_test_dispatch_0() { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %1 to %c10 step %2 { | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %3 to %c10 step %4 { | |
| %5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %8 = memref.alloca(%5, %6) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : memref<?x?xf32>) outs(%7 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %9 = linalg.index 0 : index | |
| %10 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%9, %arg0) | |
| %11 = linalg.index 1 : index | |
| %12 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%11, %arg1) | |
| %13 = cmpi eq, %10, %12 : index | |
| %14 = select %13, %cst_0, %cst : f32 | |
| linalg.yield %14 : f32 | |
| } | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After LLVMCPUVectorization //----- // | |
| func @matmul_test_dispatch_2() { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %c32 = constant 32 : index | |
| %c4 = constant 4 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %14 = memref.alloca(%11, %12) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14 : memref<?x?xf32>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %15 = linalg.index 0 : index | |
| %16 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%15, %arg0) | |
| %17 = linalg.index 1 : index | |
| %18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1) | |
| %19 = cmpi eq, %16, %18 : index | |
| %20 = select %19, %cst_0, %cst : f32 | |
| linalg.yield %20 : f32 | |
| } | |
| scf.for %arg2 = %c0 to %7 step %c32 { | |
| scf.for %arg3 = %c0 to %9 step %c32 { | |
| %15 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2) | |
| %16 = memref.subview %8[%arg2, 0] [%15, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %17 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3) | |
| %18 = memref.subview %10[0, %arg3] [10, %17] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %19 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2) | |
| %20 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3) | |
| %21 = memref.subview %13[%arg2, %arg3] [%19, %20] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg4 = %c0 to %15 step %c4 { | |
| scf.for %arg5 = %c0 to %17 step %c4 { | |
| scf.for %arg6 = %c0 to %c10 step %c4 { | |
| %22 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%15, %arg4) | |
| %23 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6) | |
| %24 = memref.subview %16[%arg4, %arg6] [%22, %23] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %25 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6) | |
| %26 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%17, %arg5) | |
| %27 = memref.subview %18[%arg6, %arg5] [%25, %26] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %28 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%15, %arg4) | |
| %29 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%17, %arg5) | |
| %30 = memref.subview %21[%arg4, %arg5] [%28, %29] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%24, %27 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%30 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| return | |
| } | |
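| // Note: LLVMCPUVectorization tiled the matmul in dispatch_2 according to its lowering.config | |
| // tileSizes [[64, 64], [32, 32, 32], [4, 4, 4]]: scf.for loops step by 32 over the workgroup | |
| // tile and then by 4, and the innermost linalg.matmul is retagged "vectorize". In this dump it | |
| // is still a linalg.matmul on dynamically sized subviews rather than vector ops, and the | |
| // identity-fill linalg.generic stays at the workgroup level. | |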
| // -----// IR Dump After Canonicalizer //----- // | |
| func @matmul_test_dispatch_0() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %1 to %c10 step %2 { | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %3 to %c10 step %4 { | |
| %5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %8 = memref.alloca(%5, %6) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : memref<?x?xf32>) outs(%7 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %9 = linalg.index 0 : index | |
| %10 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%9, %arg0) | |
| %11 = linalg.index 1 : index | |
| %12 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%11, %arg1) | |
| %13 = cmpi eq, %10, %12 : index | |
| %14 = select %13, %cst, %cst_0 : f32 | |
| linalg.yield %14 : f32 | |
| } | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After CSE //----- // | |
| func @matmul_test_dispatch_2() { | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %c32 = constant 32 : index | |
| %c4 = constant 4 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %14 = memref.alloca(%11, %12) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14 : memref<?x?xf32>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %15 = linalg.index 0 : index | |
| %16 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%15, %arg0) | |
| %17 = linalg.index 1 : index | |
| %18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1) | |
| %19 = cmpi eq, %16, %18 : index | |
| %20 = select %19, %cst_0, %cst : f32 | |
| linalg.yield %20 : f32 | |
| } | |
| scf.for %arg2 = %c0 to %7 step %c32 { | |
| scf.for %arg3 = %c0 to %9 step %c32 { | |
| %15 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2) | |
| %16 = memref.subview %8[%arg2, 0] [%15, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %17 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3) | |
| %18 = memref.subview %10[0, %arg3] [10, %17] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %19 = memref.subview %13[%arg2, %arg3] [%15, %17] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg4 = %c0 to %15 step %c4 { | |
| scf.for %arg5 = %c0 to %17 step %c4 { | |
| scf.for %arg6 = %c0 to %c10 step %c4 { | |
| %20 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%15, %arg4) | |
| %21 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6) | |
| %22 = memref.subview %16[%arg4, %arg6] [%20, %21] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %23 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%17, %arg5) | |
| %24 = memref.subview %18[%arg6, %arg5] [%21, %23] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %25 = memref.subview %19[%arg4, %arg5] [%20, %23] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%22, %24 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%25 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| return | |
| } | |
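| // Note: relative to the first LLVMCPUVectorization dump, CSE merged the duplicated per-tile | |
| // affine.min ops inside the 32-step and 4-step loops of dispatch_2, so the output subviews now | |
| // reuse the extents computed for the input subviews; the subsequent Canonicalizer and | |
| // ForOpCanonicalization dumps keep this structure and only reorder the leading constants. | |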
| // -----// IR Dump After ForOpCanonicalization //----- // | |
| func @matmul_test_dispatch_0() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %1 to %c10 step %2 { | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %3 to %c10 step %4 { | |
| %5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %8 = memref.alloca(%5, %6) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : memref<?x?xf32>) outs(%7 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %9 = linalg.index 0 : index | |
| %10 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%9, %arg0) | |
| %11 = linalg.index 1 : index | |
| %12 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%11, %arg1) | |
| %13 = cmpi eq, %10, %12 : index | |
| %14 = select %13, %cst, %cst_0 : f32 | |
| linalg.yield %14 : f32 | |
| } | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
| func @matmul_test_dispatch_2() { | |
| %c4 = constant 4 : index | |
| %c32 = constant 32 : index | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %14 = memref.alloca(%11, %12) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14 : memref<?x?xf32>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %15 = linalg.index 0 : index | |
| %16 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%15, %arg0) | |
| %17 = linalg.index 1 : index | |
| %18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1) | |
| %19 = cmpi eq, %16, %18 : index | |
| %20 = select %19, %cst, %cst_0 : f32 | |
| linalg.yield %20 : f32 | |
| } | |
| scf.for %arg2 = %c0 to %7 step %c32 { | |
| scf.for %arg3 = %c0 to %9 step %c32 { | |
| %15 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2) | |
| %16 = memref.subview %8[%arg2, 0] [%15, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %17 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3) | |
| %18 = memref.subview %10[0, %arg3] [10, %17] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %19 = memref.subview %13[%arg2, %arg3] [%15, %17] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg4 = %c0 to %15 step %c4 { | |
| scf.for %arg5 = %c0 to %17 step %c4 { | |
| scf.for %arg6 = %c0 to %c10 step %c4 { | |
| %20 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%15, %arg4) | |
| %21 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6) | |
| %22 = memref.subview %16[%arg4, %arg6] [%20, %21] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %23 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%17, %arg5) | |
| %24 = memref.subview %18[%arg6, %arg5] [%21, %23] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %25 = memref.subview %19[%arg4, %arg5] [%20, %23] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%22, %24 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%25 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After LLVMCPUPlanConvLoopOrder //----- // | |
| func @matmul_test_dispatch_0() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %1 to %c10 step %2 { | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %3 to %c10 step %4 { | |
| %5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %8 = memref.alloca(%5, %6) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : memref<?x?xf32>) outs(%7 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %9 = linalg.index 0 : index | |
| %10 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%9, %arg0) | |
| %11 = linalg.index 1 : index | |
| %12 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%11, %arg1) | |
| %13 = cmpi eq, %10, %12 : index | |
| %14 = select %13, %cst, %cst_0 : f32 | |
| linalg.yield %14 : f32 | |
| } | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After ForOpCanonicalization //----- // | |
| func @matmul_test_dispatch_2() { | |
| %c4 = constant 4 : index | |
| %c32 = constant 32 : index | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %14 = memref.alloca(%11, %12) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14 : memref<?x?xf32>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %15 = linalg.index 0 : index | |
| %16 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%15, %arg0) | |
| %17 = linalg.index 1 : index | |
| %18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1) | |
| %19 = cmpi eq, %16, %18 : index | |
| %20 = select %19, %cst, %cst_0 : f32 | |
| linalg.yield %20 : f32 | |
| } | |
| scf.for %arg2 = %c0 to %7 step %c32 { | |
| scf.for %arg3 = %c0 to %9 step %c32 { | |
| %15 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2) | |
| %16 = memref.subview %8[%arg2, 0] [%15, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %17 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3) | |
| %18 = memref.subview %10[0, %arg3] [10, %17] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %19 = memref.subview %13[%arg2, %arg3] [%15, %17] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg4 = %c0 to %15 step %c4 { | |
| scf.for %arg5 = %c0 to %17 step %c4 { | |
| scf.for %arg6 = %c0 to %c10 step %c4 { | |
| %20 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%15, %arg4) | |
| %21 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6) | |
| %22 = memref.subview %16[%arg4, %arg6] [%20, %21] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %23 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%17, %arg5) | |
| %24 = memref.subview %18[%arg6, %arg5] [%21, %23] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %25 = memref.subview %19[%arg4, %arg5] [%20, %23] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%22, %24 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%25 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After LLVMCPUPlanConvLoopOrder //----- // | |
| func @matmul_test_dispatch_2() { | |
| %c4 = constant 4 : index | |
| %c32 = constant 32 : index | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %14 = memref.alloca(%11, %12) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14 : memref<?x?xf32>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %15 = linalg.index 0 : index | |
| %16 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%15, %arg0) | |
| %17 = linalg.index 1 : index | |
| %18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1) | |
| %19 = cmpi eq, %16, %18 : index | |
| %20 = select %19, %cst, %cst_0 : f32 | |
| linalg.yield %20 : f32 | |
| } | |
| scf.for %arg2 = %c0 to %7 step %c32 { | |
| scf.for %arg3 = %c0 to %9 step %c32 { | |
| %15 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2) | |
| %16 = memref.subview %8[%arg2, 0] [%15, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %17 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3) | |
| %18 = memref.subview %10[0, %arg3] [10, %17] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %19 = memref.subview %13[%arg2, %arg3] [%15, %17] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg4 = %c0 to %15 step %c4 { | |
| scf.for %arg5 = %c0 to %17 step %c4 { | |
| scf.for %arg6 = %c0 to %c10 step %c4 { | |
| %20 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%15, %arg4) | |
| %21 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6) | |
| %22 = memref.subview %16[%arg4, %arg6] [%20, %21] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %23 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%17, %arg5) | |
| %24 = memref.subview %18[%arg6, %arg5] [%21, %23] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %25 = memref.subview %19[%arg4, %arg5] [%20, %23] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%22, %24 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%25 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After LLVMCPULowerExecutableTarget //----- // | |
| hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> { | |
| hal.executable.entry_point public @matmul_test_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} { | |
| ^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors | |
| %c1 = constant 1 : index | |
| %0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg0] | |
| %1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1] | |
| hal.return %0, %1, %c1 : index, index, index | |
| } | |
| builtin.module { | |
| func @matmul_test_dispatch_0() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %1 to %c10 step %2 { | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %3 to %c10 step %4 { | |
| %5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %8 = memref.alloca(%5, %6) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : memref<?x?xf32>) outs(%7 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %9 = linalg.index 0 : index | |
| %10 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%9, %arg0) | |
| %11 = linalg.index 1 : index | |
| %12 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%11, %arg1) | |
| %13 = cmpi eq, %10, %12 : index | |
| %14 = select %13, %cst, %cst_0 : f32 | |
| linalg.yield %14 : f32 | |
| } | |
| } | |
| } | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
| } | |
| // -----// IR Dump After LLVMCPULowerExecutableTarget //----- // | |
| hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> { | |
| hal.executable.entry_point public @matmul_test_dispatch_2 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} { | |
| ^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors | |
| %c1 = constant 1 : index | |
| %0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg0] | |
| %1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1] | |
| hal.return %0, %1, %c1 : index, index, index | |
| } | |
| builtin.module { | |
| func @matmul_test_dispatch_2() { | |
| %c4 = constant 4 : index | |
| %c32 = constant 32 : index | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %14 = memref.alloca(%11, %12) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14 : memref<?x?xf32>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %15 = linalg.index 0 : index | |
| %16 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%15, %arg0) | |
| %17 = linalg.index 1 : index | |
| %18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1) | |
| %19 = cmpi eq, %16, %18 : index | |
| %20 = select %19, %cst, %cst_0 : f32 | |
| linalg.yield %20 : f32 | |
| } | |
| scf.for %arg2 = %c0 to %7 step %c32 { | |
| scf.for %arg3 = %c0 to %9 step %c32 { | |
| %15 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2) | |
| %16 = memref.subview %8[%arg2, 0] [%15, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %17 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3) | |
| %18 = memref.subview %10[0, %arg3] [10, %17] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %19 = memref.subview %13[%arg2, %arg3] [%15, %17] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg4 = %c0 to %15 step %c4 { | |
| scf.for %arg5 = %c0 to %17 step %c4 { | |
| scf.for %arg6 = %c0 to %c10 step %c4 { | |
| %20 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%15, %arg4) | |
| %21 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6) | |
| %22 = memref.subview %16[%arg4, %arg6] [%20, %21] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %23 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%17, %arg5) | |
| %24 = memref.subview %18[%arg6, %arg5] [%21, %23] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %25 = memref.subview %19[%arg4, %arg5] [%20, %23] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%22, %24 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%25 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
| } | |
| // -----// IR Dump After LinalgExtToLoops //----- // | |
| func @matmul_test_dispatch_0() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %1 to %c10 step %2 { | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %3 to %c10 step %4 { | |
| %5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %8 = memref.alloca(%5, %6) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : memref<?x?xf32>) outs(%7 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %9 = linalg.index 0 : index | |
| %10 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%9, %arg0) | |
| %11 = linalg.index 1 : index | |
| %12 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%11, %arg1) | |
| %13 = cmpi eq, %10, %12 : index | |
| %14 = select %13, %cst, %cst_0 : f32 | |
| linalg.yield %14 : f32 | |
| } | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After LinalgExtToLoops //----- // | |
| func @matmul_test_dispatch_2() { | |
| %c4 = constant 4 : index | |
| %c32 = constant 32 : index | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %14 = memref.alloca(%11, %12) : memref<?x?xf32> | |
| linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14 : memref<?x?xf32>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %15 = linalg.index 0 : index | |
| %16 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%15, %arg0) | |
| %17 = linalg.index 1 : index | |
| %18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1) | |
| %19 = cmpi eq, %16, %18 : index | |
| %20 = select %19, %cst, %cst_0 : f32 | |
| linalg.yield %20 : f32 | |
| } | |
| scf.for %arg2 = %c0 to %7 step %c32 { | |
| scf.for %arg3 = %c0 to %9 step %c32 { | |
| %15 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2) | |
| %16 = memref.subview %8[%arg2, 0] [%15, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %17 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3) | |
| %18 = memref.subview %10[0, %arg3] [10, %17] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %19 = memref.subview %13[%arg2, %arg3] [%15, %17] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg4 = %c0 to %15 step %c4 { | |
| scf.for %arg5 = %c0 to %17 step %c4 { | |
| scf.for %arg6 = %c0 to %c10 step %c4 { | |
| %20 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%15, %arg4) | |
| %21 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6) | |
| %22 = memref.subview %16[%arg4, %arg6] [%20, %21] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %23 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%17, %arg5) | |
| %24 = memref.subview %18[%arg6, %arg5] [%21, %23] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %25 = memref.subview %19[%arg4, %arg5] [%20, %23] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%22, %24 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%25 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After LinalgLowerToLoops //----- // | |
| func @matmul_test_dispatch_0() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %c1 = constant 1 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %1 to %c10 step %2 { | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %3 to %c10 step %4 { | |
| %5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg2 = %c0 to %5 step %c1 { | |
| scf.for %arg3 = %c0 to %6 step %c1 { | |
| %8 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg2, %arg0) | |
| %9 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg1) | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| memref.store %11, %7[%arg2, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| } | |
| } | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After LinalgLowerToLoops //----- // | |
| func @matmul_test_dispatch_2() { | |
| %c4 = constant 4 : index | |
| %c32 = constant 32 : index | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %c1 = constant 1 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg2 = %c0 to %11 step %c1 { | |
| scf.for %arg3 = %c0 to %12 step %c1 { | |
| %14 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg2, %arg0) | |
| %15 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg1) | |
| %16 = cmpi eq, %14, %15 : index | |
| %17 = select %16, %cst, %cst_0 : f32 | |
| memref.store %17, %13[%arg2, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| } | |
| } | |
| scf.for %arg2 = %c0 to %7 step %c32 { | |
| scf.for %arg3 = %c0 to %9 step %c32 { | |
| %14 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2) | |
| %15 = memref.subview %8[%arg2, 0] [%14, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %16 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3) | |
| %17 = memref.subview %10[0, %arg3] [10, %16] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %18 = memref.subview %13[%arg2, %arg3] [%14, %16] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg4 = %c0 to %14 step %c4 { | |
| scf.for %arg5 = %c0 to %16 step %c4 { | |
| scf.for %arg6 = %c0 to %c10 step %c4 { | |
| %19 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%14, %arg4) | |
| %20 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6) | |
| %21 = memref.subview %15[%arg4, %arg6] [%19, %20] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %22 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%16, %arg5) | |
| %23 = memref.subview %17[%arg6, %arg5] [%20, %22] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %24 = memref.subview %18[%arg4, %arg5] [%19, %22] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg7 = %c0 to %19 step %c1 { | |
| scf.for %arg8 = %c0 to %22 step %c1 { | |
| scf.for %arg9 = %c0 to %20 step %c1 { | |
| %25 = memref.load %21[%arg7, %arg9] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %26 = memref.load %23[%arg9, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %27 = memref.load %24[%arg7, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %28 = mulf %25, %26 : f32 | |
| %29 = addf %27, %28 : f32 | |
| memref.store %29, %24[%arg7, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After mlir::iree_compiler::Shape::{anonymous}::FoldDimOverShapeCarryingOpPass //----- // | |
| func @matmul_test_dispatch_0() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %c1 = constant 1 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %1 to %c10 step %2 { | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %3 to %c10 step %4 { | |
| %5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg2 = %c0 to %5 step %c1 { | |
| scf.for %arg3 = %c0 to %6 step %c1 { | |
| %8 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg2, %arg0) | |
| %9 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg1) | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst, %cst_0 : f32 | |
| memref.store %11, %7[%arg2, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| } | |
| } | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After mlir::iree_compiler::Shape::{anonymous}::FoldDimOverShapeCarryingOpPass //----- // | |
| func @matmul_test_dispatch_2() { | |
| %c4 = constant 4 : index | |
| %c32 = constant 32 : index | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %c1 = constant 1 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg2 = %c0 to %11 step %c1 { | |
| scf.for %arg3 = %c0 to %12 step %c1 { | |
| %14 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg2, %arg0) | |
| %15 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg1) | |
| %16 = cmpi eq, %14, %15 : index | |
| %17 = select %16, %cst, %cst_0 : f32 | |
| memref.store %17, %13[%arg2, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| } | |
| } | |
| scf.for %arg2 = %c0 to %7 step %c32 { | |
| scf.for %arg3 = %c0 to %9 step %c32 { | |
| %14 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2) | |
| %15 = memref.subview %8[%arg2, 0] [%14, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %16 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3) | |
| %17 = memref.subview %10[0, %arg3] [10, %16] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %18 = memref.subview %13[%arg2, %arg3] [%14, %16] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg4 = %c0 to %14 step %c4 { | |
| scf.for %arg5 = %c0 to %16 step %c4 { | |
| scf.for %arg6 = %c0 to %c10 step %c4 { | |
| %19 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%14, %arg4) | |
| %20 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6) | |
| %21 = memref.subview %15[%arg4, %arg6] [%19, %20] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %22 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%16, %arg5) | |
| %23 = memref.subview %17[%arg6, %arg5] [%20, %22] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %24 = memref.subview %18[%arg4, %arg5] [%19, %22] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg7 = %c0 to %19 step %c1 { | |
| scf.for %arg8 = %c0 to %22 step %c1 { | |
| scf.for %arg9 = %c0 to %20 step %c1 { | |
| %25 = memref.load %21[%arg7, %arg9] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %26 = memref.load %23[%arg9, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %27 = memref.load %24[%arg7, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %28 = mulf %25, %26 : f32 | |
| %29 = addf %27, %28 : f32 | |
| memref.store %29, %24[%arg7, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
| func @matmul_test_dispatch_0() { | |
| %c1 = constant 1 : index | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %1 to %c10 step %2 { | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %3 to %c10 step %4 { | |
| %5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg2 = %c0 to %5 step %c1 { | |
| scf.for %arg3 = %c0 to %6 step %c1 { | |
| %8 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg2, %arg0) | |
| %9 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg1) | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| memref.store %11, %7[%arg2, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| } | |
| } | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer //----- // | |
| func @matmul_test_dispatch_2() { | |
| %c1 = constant 1 : index | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %c32 = constant 32 : index | |
| %c4 = constant 4 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg2 = %c0 to %11 step %c1 { | |
| scf.for %arg3 = %c0 to %12 step %c1 { | |
| %14 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg2, %arg0) | |
| %15 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg1) | |
| %16 = cmpi eq, %14, %15 : index | |
| %17 = select %16, %cst_0, %cst : f32 | |
| memref.store %17, %13[%arg2, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| } | |
| } | |
| scf.for %arg2 = %c0 to %7 step %c32 { | |
| scf.for %arg3 = %c0 to %9 step %c32 { | |
| %14 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2) | |
| %15 = memref.subview %8[%arg2, 0] [%14, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %16 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3) | |
| %17 = memref.subview %10[0, %arg3] [10, %16] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %18 = memref.subview %13[%arg2, %arg3] [%14, %16] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg4 = %c0 to %14 step %c4 { | |
| scf.for %arg5 = %c0 to %16 step %c4 { | |
| scf.for %arg6 = %c0 to %c10 step %c4 { | |
| %19 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%14, %arg4) | |
| %20 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6) | |
| %21 = memref.subview %15[%arg4, %arg6] [%19, %20] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %22 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%16, %arg5) | |
| %23 = memref.subview %17[%arg6, %arg5] [%20, %22] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %24 = memref.subview %18[%arg4, %arg5] [%19, %22] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg7 = %c0 to %19 step %c1 { | |
| scf.for %arg8 = %c0 to %22 step %c1 { | |
| scf.for %arg9 = %c0 to %20 step %c1 { | |
| %25 = memref.load %21[%arg7, %arg9] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %26 = memref.load %23[%arg9, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %27 = memref.load %24[%arg7, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %28 = mulf %25, %26 : f32 | |
| %29 = addf %27, %28 : f32 | |
| memref.store %29, %24[%arg7, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After CSE //----- // | |
| func @matmul_test_dispatch_0() { | |
| %c1 = constant 1 : index | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %1 to %c10 step %2 { | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %3 to %c10 step %4 { | |
| %5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg2 = %c0 to %5 step %c1 { | |
| scf.for %arg3 = %c0 to %6 step %c1 { | |
| %8 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg2, %arg0) | |
| %9 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg1) | |
| %10 = cmpi eq, %8, %9 : index | |
| %11 = select %10, %cst_0, %cst : f32 | |
| memref.store %11, %7[%arg2, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| } | |
| } | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After CSE //----- // | |
| func @matmul_test_dispatch_2() { | |
| %c1 = constant 1 : index | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %c32 = constant 32 : index | |
| %c4 = constant 4 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c10 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c10 step %6 { | |
| %7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0) | |
| %8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1) | |
| %10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0) | |
| %12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1) | |
| %13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg2 = %c0 to %11 step %c1 { | |
| scf.for %arg3 = %c0 to %12 step %c1 { | |
| %14 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg2, %arg0) | |
| %15 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg1) | |
| %16 = cmpi eq, %14, %15 : index | |
| %17 = select %16, %cst_0, %cst : f32 | |
| memref.store %17, %13[%arg2, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| } | |
| } | |
| scf.for %arg2 = %c0 to %7 step %c32 { | |
| scf.for %arg3 = %c0 to %9 step %c32 { | |
| %14 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2) | |
| %15 = memref.subview %8[%arg2, 0] [%14, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %16 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3) | |
| %17 = memref.subview %10[0, %arg3] [10, %16] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %18 = memref.subview %13[%arg2, %arg3] [%14, %16] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg4 = %c0 to %14 step %c4 { | |
| scf.for %arg5 = %c0 to %16 step %c4 { | |
| scf.for %arg6 = %c0 to %c10 step %c4 { | |
| %19 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%14, %arg4) | |
| %20 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6) | |
| %21 = memref.subview %15[%arg4, %arg6] [%19, %20] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %22 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%16, %arg5) | |
| %23 = memref.subview %17[%arg6, %arg5] [%20, %22] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %24 = memref.subview %18[%arg4, %arg5] [%19, %22] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| scf.for %arg7 = %c0 to %19 step %c1 { | |
| scf.for %arg8 = %c0 to %22 step %c1 { | |
| scf.for %arg9 = %c0 to %20 step %c1 { | |
| %25 = memref.load %21[%arg7, %arg9] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %26 = memref.load %23[%arg9, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %27 = memref.load %24[%arg7, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %28 = mulf %25, %26 : f32 | |
| %29 = addf %27, %28 : f32 | |
| memref.store %29, %24[%arg7, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After SCFToStandard //----- // | |
| func @matmul_test_dispatch_0() { | |
| %c1 = constant 1 : index | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| br ^bb1(%1 : index) | |
| ^bb1(%3: index): // 2 preds: ^bb0, ^bb11 | |
| %4 = cmpi slt, %3, %c10 : index | |
| cond_br %4, ^bb2, ^bb12 | |
| ^bb2: // pred: ^bb1 | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| br ^bb3(%5 : index) | |
| ^bb3(%7: index): // 2 preds: ^bb2, ^bb10 | |
| %8 = cmpi slt, %7, %c10 : index | |
| cond_br %8, ^bb4, ^bb11 | |
| ^bb4: // pred: ^bb3 | |
| %9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%3) | |
| %10 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%7) | |
| %11 = memref.subview %0[%3, %7] [%9, %10] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| br ^bb5(%c0 : index) | |
| ^bb5(%12: index): // 2 preds: ^bb4, ^bb9 | |
| %13 = cmpi slt, %12, %9 : index | |
| cond_br %13, ^bb6, ^bb10 | |
| ^bb6: // pred: ^bb5 | |
| br ^bb7(%c0 : index) | |
| ^bb7(%14: index): // 2 preds: ^bb6, ^bb8 | |
| %15 = cmpi slt, %14, %10 : index | |
| cond_br %15, ^bb8, ^bb9 | |
| ^bb8: // pred: ^bb7 | |
| %16 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%12, %3) | |
| %17 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%14, %7) | |
| %18 = cmpi eq, %16, %17 : index | |
| %19 = select %18, %cst_0, %cst : f32 | |
| memref.store %19, %11[%12, %14] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %20 = addi %14, %c1 : index | |
| br ^bb7(%20 : index) | |
| ^bb9: // pred: ^bb7 | |
| %21 = addi %12, %c1 : index | |
| br ^bb5(%21 : index) | |
| ^bb10: // pred: ^bb5 | |
| %22 = addi %7, %6 : index | |
| br ^bb3(%22 : index) | |
| ^bb11: // pred: ^bb3 | |
| %23 = addi %3, %2 : index | |
| br ^bb1(%23 : index) | |
| ^bb12: // pred: ^bb1 | |
| return | |
| } | |
| // -----// IR Dump After SCFToStandard //----- // | |
| func @matmul_test_dispatch_2() { | |
| %c1 = constant 1 : index | |
| %cst = constant 1.000000e+00 : f32 | |
| %cst_0 = constant 0.000000e+00 : f32 | |
| %c10 = constant 10 : index | |
| %c0 = constant 0 : index | |
| %c32 = constant 32 : index | |
| %c4 = constant 4 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| br ^bb1(%3 : index) | |
| ^bb1(%5: index): // 2 preds: ^bb0, ^bb35 | |
| %6 = cmpi slt, %5, %c10 : index | |
| cond_br %6, ^bb2, ^bb36 | |
| ^bb2: // pred: ^bb1 | |
| %7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| br ^bb3(%7 : index) | |
| ^bb3(%9: index): // 2 preds: ^bb2, ^bb34 | |
| %10 = cmpi slt, %9, %c10 : index | |
| cond_br %10, ^bb4, ^bb35 | |
| ^bb4: // pred: ^bb3 | |
| %11 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%5) | |
| %12 = memref.subview %0[%5, 0] [%11, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %13 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%9) | |
| %14 = memref.subview %1[0, %9] [10, %13] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %15 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%5) | |
| %16 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%9) | |
| %17 = memref.subview %2[%5, %9] [%11, %13] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| br ^bb5(%c0 : index) | |
| ^bb5(%18: index): // 2 preds: ^bb4, ^bb9 | |
| %19 = cmpi slt, %18, %15 : index | |
| cond_br %19, ^bb6, ^bb10 | |
| ^bb6: // pred: ^bb5 | |
| br ^bb7(%c0 : index) | |
| ^bb7(%20: index): // 2 preds: ^bb6, ^bb8 | |
| %21 = cmpi slt, %20, %16 : index | |
| cond_br %21, ^bb8, ^bb9 | |
| ^bb8: // pred: ^bb7 | |
| %22 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%18, %5) | |
| %23 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%20, %9) | |
| %24 = cmpi eq, %22, %23 : index | |
| %25 = select %24, %cst_0, %cst : f32 | |
| memref.store %25, %17[%18, %20] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %26 = addi %20, %c1 : index | |
| br ^bb7(%26 : index) | |
| ^bb9: // pred: ^bb7 | |
| %27 = addi %18, %c1 : index | |
| br ^bb5(%27 : index) | |
| ^bb10: // pred: ^bb5 | |
| br ^bb11(%c0 : index) | |
| ^bb11(%28: index): // 2 preds: ^bb10, ^bb33 | |
| %29 = cmpi slt, %28, %11 : index | |
| cond_br %29, ^bb12, ^bb34 | |
| ^bb12: // pred: ^bb11 | |
| br ^bb13(%c0 : index) | |
| ^bb13(%30: index): // 2 preds: ^bb12, ^bb32 | |
| %31 = cmpi slt, %30, %13 : index | |
| cond_br %31, ^bb14, ^bb33 | |
| ^bb14: // pred: ^bb13 | |
| %32 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%11, %28) | |
| %33 = memref.subview %12[%28, 0] [%32, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %34 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%13, %30) | |
| %35 = memref.subview %14[0, %30] [10, %34] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %36 = memref.subview %17[%28, %30] [%32, %34] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| br ^bb15(%c0 : index) | |
| ^bb15(%37: index): // 2 preds: ^bb14, ^bb31 | |
| %38 = cmpi slt, %37, %32 : index | |
| cond_br %38, ^bb16, ^bb32 | |
| ^bb16: // pred: ^bb15 | |
| br ^bb17(%c0 : index) | |
| ^bb17(%39: index): // 2 preds: ^bb16, ^bb30 | |
| %40 = cmpi slt, %39, %34 : index | |
| cond_br %40, ^bb18, ^bb31 | |
| ^bb18: // pred: ^bb17 | |
| br ^bb19(%c0 : index) | |
| ^bb19(%41: index): // 2 preds: ^bb18, ^bb29 | |
| %42 = cmpi slt, %41, %c10 : index | |
| cond_br %42, ^bb20, ^bb30 | |
| ^bb20: // pred: ^bb19 | |
| %43 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%32, %37) | |
| %44 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%41) | |
| %45 = memref.subview %33[%37, %41] [%43, %44] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %46 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%34, %39) | |
| %47 = memref.subview %35[%41, %39] [%44, %46] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %48 = memref.subview %36[%37, %39] [%43, %46] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| br ^bb21(%c0 : index) | |
| ^bb21(%49: index): // 2 preds: ^bb20, ^bb28 | |
| %50 = cmpi slt, %49, %43 : index | |
| cond_br %50, ^bb22, ^bb29 | |
| ^bb22: // pred: ^bb21 | |
| br ^bb23(%c0 : index) | |
| ^bb23(%51: index): // 2 preds: ^bb22, ^bb27 | |
| %52 = cmpi slt, %51, %46 : index | |
| cond_br %52, ^bb24, ^bb28 | |
| ^bb24: // pred: ^bb23 | |
| br ^bb25(%c0 : index) | |
| ^bb25(%53: index): // 2 preds: ^bb24, ^bb26 | |
| %54 = cmpi slt, %53, %44 : index | |
| cond_br %54, ^bb26, ^bb27 | |
| ^bb26: // pred: ^bb25 | |
| %55 = memref.load %45[%49, %53] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %56 = memref.load %47[%53, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %57 = memref.load %48[%49, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %58 = mulf %55, %56 : f32 | |
| %59 = addf %57, %58 : f32 | |
| memref.store %59, %48[%49, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %60 = addi %53, %c1 : index | |
| br ^bb25(%60 : index) | |
| ^bb27: // pred: ^bb25 | |
| %61 = addi %51, %c1 : index | |
| br ^bb23(%61 : index) | |
| ^bb28: // pred: ^bb23 | |
| %62 = addi %49, %c1 : index | |
| br ^bb21(%62 : index) | |
| ^bb29: // pred: ^bb21 | |
| %63 = addi %41, %c4 : index | |
| br ^bb19(%63 : index) | |
| ^bb30: // pred: ^bb19 | |
| %64 = addi %39, %c4 : index | |
| br ^bb17(%64 : index) | |
| ^bb31: // pred: ^bb17 | |
| %65 = addi %37, %c4 : index | |
| br ^bb15(%65 : index) | |
| ^bb32: // pred: ^bb15 | |
| %66 = addi %30, %c32 : index | |
| br ^bb13(%66 : index) | |
| ^bb33: // pred: ^bb13 | |
| %67 = addi %28, %c32 : index | |
| br ^bb11(%67 : index) | |
| ^bb34: // pred: ^bb11 | |
| %68 = addi %9, %8 : index | |
| br ^bb3(%68 : index) | |
| ^bb35: // pred: ^bb3 | |
| %69 = addi %5, %4 : index | |
| br ^bb1(%69 : index) | |
| ^bb36: // pred: ^bb1 | |
| return | |
| } | |
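The SCFToStandard dump above is the tiled identity-fill plus matmul dispatch with its structured scf loops lowered to an explicit CFG: each loop becomes a header block that carries the induction variable as a block argument, a cmpi/cond_br bound check, and a back edge that adds the step, which is why the single function now spans ^bb0 through ^bb36. A minimal sketch of the rewrite, using illustrative names rather than values from this dump:

    scf.for %i = %lb to %ub step %step {
      "some.op"(%i) : (index) -> ()
    }

lowers to roughly

    br ^header(%lb : index)
  ^header(%i: index):
    %in_bounds = cmpi slt, %i, %ub : index
    cond_br %in_bounds, ^body, ^exit
  ^body:
    "some.op"(%i) : (index) -> ()
    %next = addi %i, %step : index
    br ^header(%next : index)
  ^exit:
    return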
| // -----// IR Dump After Canonicalizer //----- // | |
| func @matmul_test_dispatch_0() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %c1 = constant 1 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| br ^bb1(%1 : index) | |
| ^bb1(%3: index): // 2 preds: ^bb0, ^bb10 | |
| %4 = cmpi slt, %3, %c10 : index | |
| cond_br %4, ^bb2, ^bb11 | |
| ^bb2: // pred: ^bb1 | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| br ^bb3(%5 : index) | |
| ^bb3(%7: index): // 2 preds: ^bb2, ^bb9 | |
| %8 = cmpi slt, %7, %c10 : index | |
| cond_br %8, ^bb4, ^bb10 | |
| ^bb4: // pred: ^bb3 | |
| %9 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%3] | |
| %10 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%7] | |
| %11 = memref.subview %0[%3, %7] [%9, %10] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| br ^bb5(%c0 : index) | |
| ^bb5(%12: index): // 2 preds: ^bb4, ^bb8 | |
| %13 = cmpi slt, %12, %9 : index | |
| cond_br %13, ^bb6(%c0 : index), ^bb9 | |
| ^bb6(%14: index): // 2 preds: ^bb5, ^bb7 | |
| %15 = cmpi slt, %14, %10 : index | |
| cond_br %15, ^bb7, ^bb8 | |
| ^bb7: // pred: ^bb6 | |
| %16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%12, %3] | |
| %17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%14, %7] | |
| %18 = cmpi eq, %16, %17 : index | |
| %19 = select %18, %cst, %cst_0 : f32 | |
| memref.store %19, %11[%12, %14] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %20 = addi %14, %c1 : index | |
| br ^bb6(%20 : index) | |
| ^bb8: // pred: ^bb6 | |
| %21 = addi %12, %c1 : index | |
| br ^bb5(%21 : index) | |
| ^bb9: // pred: ^bb5 | |
| %22 = addi %7, %6 : index | |
| br ^bb3(%22 : index) | |
| ^bb10: // pred: ^bb3 | |
| %23 = addi %3, %2 : index | |
| br ^bb1(%23 : index) | |
| ^bb11: // pred: ^bb1 | |
| return | |
| } | |
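Compared with the branch-form IR that SCFToStandard produces (see the dispatch_2 dump above), two canonicalizations are visible in this dispatch_0 dump: affine.min and affine.apply now take their operands in symbol positions (affine_map<()[s0] -> (64, -s0 + 10)> instead of affine_map<(d0) -> (64, -d0 + 10)>), and blocks that did nothing but forward a constant initial induction value have been folded into their predecessor's cond_br, so the inner loop is entered directly as ^bb6(%c0 : index). A sketch of that block folding, with illustrative names:

    cond_br %cond, ^preheader, ^exit
  ^preheader:  // only forwards the loop's initial value
    br ^inner(%c0 : index)

becomes

    cond_br %cond, ^inner(%c0 : index), ^exit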
| // -----// IR Dump After CSE //----- // | |
| func @matmul_test_dispatch_0() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %c1 = constant 1 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| br ^bb1(%1 : index) | |
| ^bb1(%3: index): // 2 preds: ^bb0, ^bb10 | |
| %4 = cmpi slt, %3, %c10 : index | |
| cond_br %4, ^bb2, ^bb11 | |
| ^bb2: // pred: ^bb1 | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| br ^bb3(%5 : index) | |
| ^bb3(%7: index): // 2 preds: ^bb2, ^bb9 | |
| %8 = cmpi slt, %7, %c10 : index | |
| cond_br %8, ^bb4, ^bb10 | |
| ^bb4: // pred: ^bb3 | |
| %9 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%3] | |
| %10 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%7] | |
| %11 = memref.subview %0[%3, %7] [%9, %10] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| br ^bb5(%c0 : index) | |
| ^bb5(%12: index): // 2 preds: ^bb4, ^bb8 | |
| %13 = cmpi slt, %12, %9 : index | |
| cond_br %13, ^bb6(%c0 : index), ^bb9 | |
| ^bb6(%14: index): // 2 preds: ^bb5, ^bb7 | |
| %15 = cmpi slt, %14, %10 : index | |
| cond_br %15, ^bb7, ^bb8 | |
| ^bb7: // pred: ^bb6 | |
| %16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%12, %3] | |
| %17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%14, %7] | |
| %18 = cmpi eq, %16, %17 : index | |
| %19 = select %18, %cst, %cst_0 : f32 | |
| memref.store %19, %11[%12, %14] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %20 = addi %14, %c1 : index | |
| br ^bb6(%20 : index) | |
| ^bb8: // pred: ^bb6 | |
| %21 = addi %12, %c1 : index | |
| br ^bb5(%21 : index) | |
| ^bb9: // pred: ^bb5 | |
| %22 = addi %7, %6 : index | |
| br ^bb3(%22 : index) | |
| ^bb10: // pred: ^bb3 | |
| %23 = addi %3, %2 : index | |
| br ^bb1(%23 : index) | |
| ^bb11: // pred: ^bb1 | |
| return | |
| } | |
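This CSE dump is identical to the canonicalized dispatch_0 above: canonicalization already left each affine map and constant with a single definition, so common subexpression elimination has nothing further to fold here. For reference, the kind of redundancy CSE removes looks like the following (illustrative, not taken from this dump):

    %a = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%wg]
    %b = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%wg]
    // CSE replaces uses of %b with %a and erases the duplicate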
| // -----// IR Dump After Canonicalizer //----- // | |
| func @matmul_test_dispatch_2() { | |
| %c4 = constant 4 : index | |
| %c32 = constant 32 : index | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %c1 = constant 1 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| br ^bb1(%3 : index) | |
| ^bb1(%5: index): // 2 preds: ^bb0, ^bb28 | |
| %6 = cmpi slt, %5, %c10 : index | |
| cond_br %6, ^bb2, ^bb29 | |
| ^bb2: // pred: ^bb1 | |
| %7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| br ^bb3(%7 : index) | |
| ^bb3(%9: index): // 2 preds: ^bb2, ^bb27 | |
| %10 = cmpi slt, %9, %c10 : index | |
| cond_br %10, ^bb4, ^bb28 | |
| ^bb4: // pred: ^bb3 | |
| %11 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%5] | |
| %12 = memref.subview %0[%5, 0] [%11, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %13 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%9] | |
| %14 = memref.subview %1[0, %9] [10, %13] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %15 = affine.min affine_map<()[s0] -> (-s0 + 10, 64)>()[%5] | |
| %16 = affine.min affine_map<()[s0] -> (-s0 + 10, 64)>()[%9] | |
| %17 = memref.subview %2[%5, %9] [%11, %13] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| br ^bb5(%c0 : index) | |
| ^bb5(%18: index): // 2 preds: ^bb4, ^bb8 | |
| %19 = cmpi slt, %18, %15 : index | |
| cond_br %19, ^bb6(%c0 : index), ^bb9(%c0 : index) | |
| ^bb6(%20: index): // 2 preds: ^bb5, ^bb7 | |
| %21 = cmpi slt, %20, %16 : index | |
| cond_br %21, ^bb7, ^bb8 | |
| ^bb7: // pred: ^bb6 | |
| %22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%18, %5] | |
| %23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%20, %9] | |
| %24 = cmpi eq, %22, %23 : index | |
| %25 = select %24, %cst, %cst_0 : f32 | |
| memref.store %25, %17[%18, %20] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %26 = addi %20, %c1 : index | |
| br ^bb6(%26 : index) | |
| ^bb8: // pred: ^bb6 | |
| %27 = addi %18, %c1 : index | |
| br ^bb5(%27 : index) | |
| ^bb9(%28: index): // 2 preds: ^bb5, ^bb26 | |
| %29 = cmpi slt, %28, %11 : index | |
| cond_br %29, ^bb10(%c0 : index), ^bb27 | |
| ^bb10(%30: index): // 2 preds: ^bb9, ^bb25 | |
| %31 = cmpi slt, %30, %13 : index | |
| cond_br %31, ^bb11, ^bb26 | |
| ^bb11: // pred: ^bb10 | |
| %32 = affine.min affine_map<()[s0, s1] -> (32, s0 - s1)>()[%11, %28] | |
| %33 = memref.subview %12[%28, 0] [%32, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %34 = affine.min affine_map<()[s0, s1] -> (32, s0 - s1)>()[%13, %30] | |
| %35 = memref.subview %14[0, %30] [10, %34] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %36 = memref.subview %17[%28, %30] [%32, %34] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| br ^bb12(%c0 : index) | |
| ^bb12(%37: index): // 2 preds: ^bb11, ^bb24 | |
| %38 = cmpi slt, %37, %32 : index | |
| cond_br %38, ^bb13(%c0 : index), ^bb25 | |
| ^bb13(%39: index): // 2 preds: ^bb12, ^bb23 | |
| %40 = cmpi slt, %39, %34 : index | |
| cond_br %40, ^bb14(%c0 : index), ^bb24 | |
| ^bb14(%41: index): // 2 preds: ^bb13, ^bb22 | |
| %42 = cmpi slt, %41, %c10 : index | |
| cond_br %42, ^bb15, ^bb23 | |
| ^bb15: // pred: ^bb14 | |
| %43 = affine.min affine_map<()[s0, s1] -> (4, s0 - s1)>()[%32, %37] | |
| %44 = affine.min affine_map<()[s0] -> (4, -s0 + 10)>()[%41] | |
| %45 = memref.subview %33[%37, %41] [%43, %44] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %46 = affine.min affine_map<()[s0, s1] -> (4, s0 - s1)>()[%34, %39] | |
| %47 = memref.subview %35[%41, %39] [%44, %46] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %48 = memref.subview %36[%37, %39] [%43, %46] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| br ^bb16(%c0 : index) | |
| ^bb16(%49: index): // 2 preds: ^bb15, ^bb21 | |
| %50 = cmpi slt, %49, %43 : index | |
| cond_br %50, ^bb17(%c0 : index), ^bb22 | |
| ^bb17(%51: index): // 2 preds: ^bb16, ^bb20 | |
| %52 = cmpi slt, %51, %46 : index | |
| cond_br %52, ^bb18(%c0 : index), ^bb21 | |
| ^bb18(%53: index): // 2 preds: ^bb17, ^bb19 | |
| %54 = cmpi slt, %53, %44 : index | |
| cond_br %54, ^bb19, ^bb20 | |
| ^bb19: // pred: ^bb18 | |
| %55 = memref.load %45[%49, %53] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %56 = memref.load %47[%53, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %57 = memref.load %48[%49, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %58 = mulf %55, %56 : f32 | |
| %59 = addf %57, %58 : f32 | |
| memref.store %59, %48[%49, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %60 = addi %53, %c1 : index | |
| br ^bb18(%60 : index) | |
| ^bb20: // pred: ^bb18 | |
| %61 = addi %51, %c1 : index | |
| br ^bb17(%61 : index) | |
| ^bb21: // pred: ^bb17 | |
| %62 = addi %49, %c1 : index | |
| br ^bb16(%62 : index) | |
| ^bb22: // pred: ^bb16 | |
| %63 = addi %41, %c4 : index | |
| br ^bb14(%63 : index) | |
| ^bb23: // pred: ^bb14 | |
| %64 = addi %39, %c4 : index | |
| br ^bb13(%64 : index) | |
| ^bb24: // pred: ^bb13 | |
| %65 = addi %37, %c4 : index | |
| br ^bb12(%65 : index) | |
| ^bb25: // pred: ^bb12 | |
| %66 = addi %30, %c32 : index | |
| br ^bb10(%66 : index) | |
| ^bb26: // pred: ^bb10 | |
| %67 = addi %28, %c32 : index | |
| br ^bb9(%67 : index) | |
| ^bb27: // pred: ^bb9 | |
| %68 = addi %9, %8 : index | |
| br ^bb3(%68 : index) | |
| ^bb28: // pred: ^bb3 | |
| %69 = addi %5, %4 : index | |
| br ^bb1(%69 : index) | |
| ^bb29: // pred: ^bb1 | |
| return | |
| } | |
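The same canonicalizations applied to the matmul dispatch shrink its CFG: the highest-numbered block drops from ^bb36 in the SCFToStandard dump to ^bb29 here, because every forwarding-only block between loop levels is merged into a cond_br that passes %c0 as a block argument, and the affine.min operands again move to symbol positions. The innermost ^bb19 body (two loads, a load of the accumulator, mulf, addf, store) is untouched.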
| // -----// IR Dump After TensorConstantBufferize //----- // | |
| module { | |
| func @matmul_test_dispatch_0() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %c1 = constant 1 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| br ^bb1(%1 : index) | |
| ^bb1(%3: index): // 2 preds: ^bb0, ^bb10 | |
| %4 = cmpi slt, %3, %c10 : index | |
| cond_br %4, ^bb2, ^bb11 | |
| ^bb2: // pred: ^bb1 | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| br ^bb3(%5 : index) | |
| ^bb3(%7: index): // 2 preds: ^bb2, ^bb9 | |
| %8 = cmpi slt, %7, %c10 : index | |
| cond_br %8, ^bb4, ^bb10 | |
| ^bb4: // pred: ^bb3 | |
| %9 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%3] | |
| %10 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%7] | |
| %11 = memref.subview %0[%3, %7] [%9, %10] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| br ^bb5(%c0 : index) | |
| ^bb5(%12: index): // 2 preds: ^bb4, ^bb8 | |
| %13 = cmpi slt, %12, %9 : index | |
| cond_br %13, ^bb6(%c0 : index), ^bb9 | |
| ^bb6(%14: index): // 2 preds: ^bb5, ^bb7 | |
| %15 = cmpi slt, %14, %10 : index | |
| cond_br %15, ^bb7, ^bb8 | |
| ^bb7: // pred: ^bb6 | |
| %16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%12, %3] | |
| %17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%14, %7] | |
| %18 = cmpi eq, %16, %17 : index | |
| %19 = select %18, %cst, %cst_0 : f32 | |
| memref.store %19, %11[%12, %14] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %20 = addi %14, %c1 : index | |
| br ^bb6(%20 : index) | |
| ^bb8: // pred: ^bb6 | |
| %21 = addi %12, %c1 : index | |
| br ^bb5(%21 : index) | |
| ^bb9: // pred: ^bb5 | |
| %22 = addi %7, %6 : index | |
| br ^bb3(%22 : index) | |
| ^bb10: // pred: ^bb3 | |
| %23 = addi %3, %2 : index | |
| br ^bb1(%23 : index) | |
| ^bb11: // pred: ^bb1 | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
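TensorConstantBufferize leaves matmul_test_dispatch_0 untouched because it contains no constant of tensor type (the dispatch_2 dump further down is unchanged for the same reason); from this point the dumps also print the enclosing module with its hal.interface bindings. For a dispatch that did carry a dense tensor constant, this pass would, roughly, as of this 2021 snapshot, hoist it into a module-level global and read it back as a buffer, along the lines of:

    memref.global "private" constant @cst : memref<4xf32> = dense<[1.0, 2.0, 3.0, 4.0]>
    ...
    %buf = memref.get_global @cst : memref<4xf32>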
| // -----// IR Dump After CSE //----- // | |
| func @matmul_test_dispatch_2() { | |
| %c4 = constant 4 : index | |
| %c32 = constant 32 : index | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %c1 = constant 1 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| br ^bb1(%3 : index) | |
| ^bb1(%5: index): // 2 preds: ^bb0, ^bb28 | |
| %6 = cmpi slt, %5, %c10 : index | |
| cond_br %6, ^bb2, ^bb29 | |
| ^bb2: // pred: ^bb1 | |
| %7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| br ^bb3(%7 : index) | |
| ^bb3(%9: index): // 2 preds: ^bb2, ^bb27 | |
| %10 = cmpi slt, %9, %c10 : index | |
| cond_br %10, ^bb4, ^bb28 | |
| ^bb4: // pred: ^bb3 | |
| %11 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%5] | |
| %12 = memref.subview %0[%5, 0] [%11, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %13 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%9] | |
| %14 = memref.subview %1[0, %9] [10, %13] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %15 = affine.min affine_map<()[s0] -> (-s0 + 10, 64)>()[%5] | |
| %16 = affine.min affine_map<()[s0] -> (-s0 + 10, 64)>()[%9] | |
| %17 = memref.subview %2[%5, %9] [%11, %13] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| br ^bb5(%c0 : index) | |
| ^bb5(%18: index): // 2 preds: ^bb4, ^bb8 | |
| %19 = cmpi slt, %18, %15 : index | |
| cond_br %19, ^bb6(%c0 : index), ^bb9(%c0 : index) | |
| ^bb6(%20: index): // 2 preds: ^bb5, ^bb7 | |
| %21 = cmpi slt, %20, %16 : index | |
| cond_br %21, ^bb7, ^bb8 | |
| ^bb7: // pred: ^bb6 | |
| %22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%18, %5] | |
| %23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%20, %9] | |
| %24 = cmpi eq, %22, %23 : index | |
| %25 = select %24, %cst, %cst_0 : f32 | |
| memref.store %25, %17[%18, %20] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %26 = addi %20, %c1 : index | |
| br ^bb6(%26 : index) | |
| ^bb8: // pred: ^bb6 | |
| %27 = addi %18, %c1 : index | |
| br ^bb5(%27 : index) | |
| ^bb9(%28: index): // 2 preds: ^bb5, ^bb26 | |
| %29 = cmpi slt, %28, %11 : index | |
| cond_br %29, ^bb10(%c0 : index), ^bb27 | |
| ^bb10(%30: index): // 2 preds: ^bb9, ^bb25 | |
| %31 = cmpi slt, %30, %13 : index | |
| cond_br %31, ^bb11, ^bb26 | |
| ^bb11: // pred: ^bb10 | |
| %32 = affine.min affine_map<()[s0, s1] -> (32, s0 - s1)>()[%11, %28] | |
| %33 = memref.subview %12[%28, 0] [%32, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %34 = affine.min affine_map<()[s0, s1] -> (32, s0 - s1)>()[%13, %30] | |
| %35 = memref.subview %14[0, %30] [10, %34] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %36 = memref.subview %17[%28, %30] [%32, %34] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| br ^bb12(%c0 : index) | |
| ^bb12(%37: index): // 2 preds: ^bb11, ^bb24 | |
| %38 = cmpi slt, %37, %32 : index | |
| cond_br %38, ^bb13(%c0 : index), ^bb25 | |
| ^bb13(%39: index): // 2 preds: ^bb12, ^bb23 | |
| %40 = cmpi slt, %39, %34 : index | |
| cond_br %40, ^bb14(%c0 : index), ^bb24 | |
| ^bb14(%41: index): // 2 preds: ^bb13, ^bb22 | |
| %42 = cmpi slt, %41, %c10 : index | |
| cond_br %42, ^bb15, ^bb23 | |
| ^bb15: // pred: ^bb14 | |
| %43 = affine.min affine_map<()[s0, s1] -> (4, s0 - s1)>()[%32, %37] | |
| %44 = affine.min affine_map<()[s0] -> (4, -s0 + 10)>()[%41] | |
| %45 = memref.subview %33[%37, %41] [%43, %44] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %46 = affine.min affine_map<()[s0, s1] -> (4, s0 - s1)>()[%34, %39] | |
| %47 = memref.subview %35[%41, %39] [%44, %46] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %48 = memref.subview %36[%37, %39] [%43, %46] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| br ^bb16(%c0 : index) | |
| ^bb16(%49: index): // 2 preds: ^bb15, ^bb21 | |
| %50 = cmpi slt, %49, %43 : index | |
| cond_br %50, ^bb17(%c0 : index), ^bb22 | |
| ^bb17(%51: index): // 2 preds: ^bb16, ^bb20 | |
| %52 = cmpi slt, %51, %46 : index | |
| cond_br %52, ^bb18(%c0 : index), ^bb21 | |
| ^bb18(%53: index): // 2 preds: ^bb17, ^bb19 | |
| %54 = cmpi slt, %53, %44 : index | |
| cond_br %54, ^bb19, ^bb20 | |
| ^bb19: // pred: ^bb18 | |
| %55 = memref.load %45[%49, %53] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %56 = memref.load %47[%53, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %57 = memref.load %48[%49, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %58 = mulf %55, %56 : f32 | |
| %59 = addf %57, %58 : f32 | |
| memref.store %59, %48[%49, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %60 = addi %53, %c1 : index | |
| br ^bb18(%60 : index) | |
| ^bb20: // pred: ^bb18 | |
| %61 = addi %51, %c1 : index | |
| br ^bb17(%61 : index) | |
| ^bb21: // pred: ^bb17 | |
| %62 = addi %49, %c1 : index | |
| br ^bb16(%62 : index) | |
| ^bb22: // pred: ^bb16 | |
| %63 = addi %41, %c4 : index | |
| br ^bb14(%63 : index) | |
| ^bb23: // pred: ^bb14 | |
| %64 = addi %39, %c4 : index | |
| br ^bb13(%64 : index) | |
| ^bb24: // pred: ^bb13 | |
| %65 = addi %37, %c4 : index | |
| br ^bb12(%65 : index) | |
| ^bb25: // pred: ^bb12 | |
| %66 = addi %30, %c32 : index | |
| br ^bb10(%66 : index) | |
| ^bb26: // pred: ^bb10 | |
| %67 = addi %28, %c32 : index | |
| br ^bb9(%67 : index) | |
| ^bb27: // pred: ^bb9 | |
| %68 = addi %9, %8 : index | |
| br ^bb3(%68 : index) | |
| ^bb28: // pred: ^bb3 | |
| %69 = addi %5, %4 : index | |
| br ^bb1(%69 : index) | |
| ^bb29: // pred: ^bb1 | |
| return | |
| } | |
| // -----// IR Dump After FoldTensorExtractOp //----- // | |
| module { | |
| func @matmul_test_dispatch_0() { | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %c1 = constant 1 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| br ^bb1(%1 : index) | |
| ^bb1(%3: index): // 2 preds: ^bb0, ^bb10 | |
| %4 = cmpi slt, %3, %c10 : index | |
| cond_br %4, ^bb2, ^bb11 | |
| ^bb2: // pred: ^bb1 | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| br ^bb3(%5 : index) | |
| ^bb3(%7: index): // 2 preds: ^bb2, ^bb9 | |
| %8 = cmpi slt, %7, %c10 : index | |
| cond_br %8, ^bb4, ^bb10 | |
| ^bb4: // pred: ^bb3 | |
| %9 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%3] | |
| %10 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%7] | |
| %11 = memref.subview %0[%3, %7] [%9, %10] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| br ^bb5(%c0 : index) | |
| ^bb5(%12: index): // 2 preds: ^bb4, ^bb8 | |
| %13 = cmpi slt, %12, %9 : index | |
| cond_br %13, ^bb6(%c0 : index), ^bb9 | |
| ^bb6(%14: index): // 2 preds: ^bb5, ^bb7 | |
| %15 = cmpi slt, %14, %10 : index | |
| cond_br %15, ^bb7, ^bb8 | |
| ^bb7: // pred: ^bb6 | |
| %16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%12, %3] | |
| %17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%14, %7] | |
| %18 = cmpi eq, %16, %17 : index | |
| %19 = select %18, %cst, %cst_0 : f32 | |
| memref.store %19, %11[%12, %14] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %20 = addi %14, %c1 : index | |
| br ^bb6(%20 : index) | |
| ^bb8: // pred: ^bb6 | |
| %21 = addi %12, %c1 : index | |
| br ^bb5(%21 : index) | |
| ^bb9: // pred: ^bb5 | |
| %22 = addi %7, %6 : index | |
| br ^bb3(%22 : index) | |
| ^bb10: // pred: ^bb3 | |
| %23 = addi %3, %2 : index | |
| br ^bb1(%23 : index) | |
| ^bb11: // pred: ^bb1 | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
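FoldTensorExtractOp is likewise a no-op for this code: the function body matches the previous dump exactly. The pass targets leftover tensor.extract operations after bufferization (folding them into direct memref.load ops), and these dispatches already operate purely on memrefs at this stage.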
| // -----// IR Dump After TensorConstantBufferize //----- // | |
| module { | |
| func @matmul_test_dispatch_2() { | |
| %c4 = constant 4 : index | |
| %c32 = constant 32 : index | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %c1 = constant 1 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| br ^bb1(%3 : index) | |
| ^bb1(%5: index): // 2 preds: ^bb0, ^bb28 | |
| %6 = cmpi slt, %5, %c10 : index | |
| cond_br %6, ^bb2, ^bb29 | |
| ^bb2: // pred: ^bb1 | |
| %7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| br ^bb3(%7 : index) | |
| ^bb3(%9: index): // 2 preds: ^bb2, ^bb27 | |
| %10 = cmpi slt, %9, %c10 : index | |
| cond_br %10, ^bb4, ^bb28 | |
| ^bb4: // pred: ^bb3 | |
| %11 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%5] | |
| %12 = memref.subview %0[%5, 0] [%11, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %13 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%9] | |
| %14 = memref.subview %1[0, %9] [10, %13] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %15 = affine.min affine_map<()[s0] -> (-s0 + 10, 64)>()[%5] | |
| %16 = affine.min affine_map<()[s0] -> (-s0 + 10, 64)>()[%9] | |
| %17 = memref.subview %2[%5, %9] [%11, %13] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| br ^bb5(%c0 : index) | |
| ^bb5(%18: index): // 2 preds: ^bb4, ^bb8 | |
| %19 = cmpi slt, %18, %15 : index | |
| cond_br %19, ^bb6(%c0 : index), ^bb9(%c0 : index) | |
| ^bb6(%20: index): // 2 preds: ^bb5, ^bb7 | |
| %21 = cmpi slt, %20, %16 : index | |
| cond_br %21, ^bb7, ^bb8 | |
| ^bb7: // pred: ^bb6 | |
| %22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%18, %5] | |
| %23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%20, %9] | |
| %24 = cmpi eq, %22, %23 : index | |
| %25 = select %24, %cst, %cst_0 : f32 | |
| memref.store %25, %17[%18, %20] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %26 = addi %20, %c1 : index | |
| br ^bb6(%26 : index) | |
| ^bb8: // pred: ^bb6 | |
| %27 = addi %18, %c1 : index | |
| br ^bb5(%27 : index) | |
| ^bb9(%28: index): // 2 preds: ^bb5, ^bb26 | |
| %29 = cmpi slt, %28, %11 : index | |
| cond_br %29, ^bb10(%c0 : index), ^bb27 | |
| ^bb10(%30: index): // 2 preds: ^bb9, ^bb25 | |
| %31 = cmpi slt, %30, %13 : index | |
| cond_br %31, ^bb11, ^bb26 | |
| ^bb11: // pred: ^bb10 | |
| %32 = affine.min affine_map<()[s0, s1] -> (32, s0 - s1)>()[%11, %28] | |
| %33 = memref.subview %12[%28, 0] [%32, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %34 = affine.min affine_map<()[s0, s1] -> (32, s0 - s1)>()[%13, %30] | |
| %35 = memref.subview %14[0, %30] [10, %34] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %36 = memref.subview %17[%28, %30] [%32, %34] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| br ^bb12(%c0 : index) | |
| ^bb12(%37: index): // 2 preds: ^bb11, ^bb24 | |
| %38 = cmpi slt, %37, %32 : index | |
| cond_br %38, ^bb13(%c0 : index), ^bb25 | |
| ^bb13(%39: index): // 2 preds: ^bb12, ^bb23 | |
| %40 = cmpi slt, %39, %34 : index | |
| cond_br %40, ^bb14(%c0 : index), ^bb24 | |
| ^bb14(%41: index): // 2 preds: ^bb13, ^bb22 | |
| %42 = cmpi slt, %41, %c10 : index | |
| cond_br %42, ^bb15, ^bb23 | |
| ^bb15: // pred: ^bb14 | |
| %43 = affine.min affine_map<()[s0, s1] -> (4, s0 - s1)>()[%32, %37] | |
| %44 = affine.min affine_map<()[s0] -> (4, -s0 + 10)>()[%41] | |
| %45 = memref.subview %33[%37, %41] [%43, %44] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %46 = affine.min affine_map<()[s0, s1] -> (4, s0 - s1)>()[%34, %39] | |
| %47 = memref.subview %35[%41, %39] [%44, %46] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %48 = memref.subview %36[%37, %39] [%43, %46] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| br ^bb16(%c0 : index) | |
| ^bb16(%49: index): // 2 preds: ^bb15, ^bb21 | |
| %50 = cmpi slt, %49, %43 : index | |
| cond_br %50, ^bb17(%c0 : index), ^bb22 | |
| ^bb17(%51: index): // 2 preds: ^bb16, ^bb20 | |
| %52 = cmpi slt, %51, %46 : index | |
| cond_br %52, ^bb18(%c0 : index), ^bb21 | |
| ^bb18(%53: index): // 2 preds: ^bb17, ^bb19 | |
| %54 = cmpi slt, %53, %44 : index | |
| cond_br %54, ^bb19, ^bb20 | |
| ^bb19: // pred: ^bb18 | |
| %55 = memref.load %45[%49, %53] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %56 = memref.load %47[%53, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %57 = memref.load %48[%49, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %58 = mulf %55, %56 : f32 | |
| %59 = addf %57, %58 : f32 | |
| memref.store %59, %48[%49, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %60 = addi %53, %c1 : index | |
| br ^bb18(%60 : index) | |
| ^bb20: // pred: ^bb18 | |
| %61 = addi %51, %c1 : index | |
| br ^bb17(%61 : index) | |
| ^bb21: // pred: ^bb17 | |
| %62 = addi %49, %c1 : index | |
| br ^bb16(%62 : index) | |
| ^bb22: // pred: ^bb16 | |
| %63 = addi %41, %c4 : index | |
| br ^bb14(%63 : index) | |
| ^bb23: // pred: ^bb14 | |
| %64 = addi %39, %c4 : index | |
| br ^bb13(%64 : index) | |
| ^bb24: // pred: ^bb13 | |
| %65 = addi %37, %c4 : index | |
| br ^bb12(%65 : index) | |
| ^bb25: // pred: ^bb12 | |
| %66 = addi %30, %c32 : index | |
| br ^bb10(%66 : index) | |
| ^bb26: // pred: ^bb10 | |
| %67 = addi %28, %c32 : index | |
| br ^bb9(%67 : index) | |
| ^bb27: // pred: ^bb9 | |
| %68 = addi %9, %8 : index | |
| br ^bb3(%68 : index) | |
| ^bb28: // pred: ^bb3 | |
| %69 = addi %5, %4 : index | |
| br ^bb1(%69 : index) | |
| ^bb29: // pred: ^bb1 | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
| // -----// IR Dump After FoldTensorExtractOp //----- // | |
| module { | |
| func @matmul_test_dispatch_2() { | |
| %c4 = constant 4 : index | |
| %c32 = constant 32 : index | |
| %c0 = constant 0 : index | |
| %c10 = constant 10 : index | |
| %cst = constant 0.000000e+00 : f32 | |
| %cst_0 = constant 1.000000e+00 : f32 | |
| %c1 = constant 1 : index | |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32> | |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32> | |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] | |
| br ^bb1(%3 : index) | |
| ^bb1(%5: index): // 2 preds: ^bb0, ^bb28 | |
| %6 = cmpi slt, %5, %c10 : index | |
| cond_br %6, ^bb2, ^bb29 | |
| ^bb2: // pred: ^bb1 | |
| %7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
| %8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
| br ^bb3(%7 : index) | |
| ^bb3(%9: index): // 2 preds: ^bb2, ^bb27 | |
| %10 = cmpi slt, %9, %c10 : index | |
| cond_br %10, ^bb4, ^bb28 | |
| ^bb4: // pred: ^bb3 | |
| %11 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%5] | |
| %12 = memref.subview %0[%5, 0] [%11, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %13 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%9] | |
| %14 = memref.subview %1[0, %9] [10, %13] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %15 = affine.min affine_map<()[s0] -> (-s0 + 10, 64)>()[%5] | |
| %16 = affine.min affine_map<()[s0] -> (-s0 + 10, 64)>()[%9] | |
| %17 = memref.subview %2[%5, %9] [%11, %13] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| br ^bb5(%c0 : index) | |
| ^bb5(%18: index): // 2 preds: ^bb4, ^bb8 | |
| %19 = cmpi slt, %18, %15 : index | |
| cond_br %19, ^bb6(%c0 : index), ^bb9(%c0 : index) | |
| ^bb6(%20: index): // 2 preds: ^bb5, ^bb7 | |
| %21 = cmpi slt, %20, %16 : index | |
| cond_br %21, ^bb7, ^bb8 | |
| ^bb7: // pred: ^bb6 | |
| %22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%18, %5] | |
| %23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%20, %9] | |
| %24 = cmpi eq, %22, %23 : index | |
| %25 = select %24, %cst, %cst_0 : f32 | |
| memref.store %25, %17[%18, %20] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %26 = addi %20, %c1 : index | |
| br ^bb6(%26 : index) | |
| ^bb8: // pred: ^bb6 | |
| %27 = addi %18, %c1 : index | |
| br ^bb5(%27 : index) | |
| ^bb9(%28: index): // 2 preds: ^bb5, ^bb26 | |
| %29 = cmpi slt, %28, %11 : index | |
| cond_br %29, ^bb10(%c0 : index), ^bb27 | |
| ^bb10(%30: index): // 2 preds: ^bb9, ^bb25 | |
| %31 = cmpi slt, %30, %13 : index | |
| cond_br %31, ^bb11, ^bb26 | |
| ^bb11: // pred: ^bb10 | |
| %32 = affine.min affine_map<()[s0, s1] -> (32, s0 - s1)>()[%11, %28] | |
| %33 = memref.subview %12[%28, 0] [%32, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %34 = affine.min affine_map<()[s0, s1] -> (32, s0 - s1)>()[%13, %30] | |
| %35 = memref.subview %14[0, %30] [10, %34] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %36 = memref.subview %17[%28, %30] [%32, %34] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| br ^bb12(%c0 : index) | |
| ^bb12(%37: index): // 2 preds: ^bb11, ^bb24 | |
| %38 = cmpi slt, %37, %32 : index | |
| cond_br %38, ^bb13(%c0 : index), ^bb25 | |
| ^bb13(%39: index): // 2 preds: ^bb12, ^bb23 | |
| %40 = cmpi slt, %39, %34 : index | |
| cond_br %40, ^bb14(%c0 : index), ^bb24 | |
| ^bb14(%41: index): // 2 preds: ^bb13, ^bb22 | |
| %42 = cmpi slt, %41, %c10 : index | |
| cond_br %42, ^bb15, ^bb23 | |
| ^bb15: // pred: ^bb14 | |
| %43 = affine.min affine_map<()[s0, s1] -> (4, s0 - s1)>()[%32, %37] | |
| %44 = affine.min affine_map<()[s0] -> (4, -s0 + 10)>()[%41] | |
| %45 = memref.subview %33[%37, %41] [%43, %44] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %46 = affine.min affine_map<()[s0, s1] -> (4, s0 - s1)>()[%34, %39] | |
| %47 = memref.subview %35[%41, %39] [%44, %46] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %48 = memref.subview %36[%37, %39] [%43, %46] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| br ^bb16(%c0 : index) | |
| ^bb16(%49: index): // 2 preds: ^bb15, ^bb21 | |
| %50 = cmpi slt, %49, %43 : index | |
| cond_br %50, ^bb17(%c0 : index), ^bb22 | |
| ^bb17(%51: index): // 2 preds: ^bb16, ^bb20 | |
| %52 = cmpi slt, %51, %46 : index | |
| cond_br %52, ^bb18(%c0 : index), ^bb21 | |
| ^bb18(%53: index): // 2 preds: ^bb17, ^bb19 | |
| %54 = cmpi slt, %53, %44 : index | |
| cond_br %54, ^bb19, ^bb20 | |
| ^bb19: // pred: ^bb18 | |
| %55 = memref.load %45[%49, %53] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %56 = memref.load %47[%53, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %57 = memref.load %48[%49, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %58 = mulf %55, %56 : f32 | |
| %59 = addf %57, %58 : f32 | |
| memref.store %59, %48[%49, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> | |
| %60 = addi %53, %c1 : index | |
| br ^bb18(%60 : index) | |
| ^bb20: // pred: ^bb18 | |
| %61 = addi %51, %c1 : index | |
| br ^bb17(%61 : index) | |
| ^bb21: // pred: ^bb17 | |
| %62 = addi %49, %c1 : index | |
| br ^bb16(%62 : index) | |
| ^bb22: // pred: ^bb16 | |
| %63 = addi %41, %c4 : index | |
| br ^bb14(%63 : index) | |
| ^bb23: // pred: ^bb14 | |
| %64 = addi %39, %c4 : index | |
| br ^bb13(%64 : index) | |
| ^bb24: // pred: ^bb13 | |
| %65 = addi %37, %c4 : index | |
| br ^bb12(%65 : index) | |
| ^bb25: // pred: ^bb12 | |
| %66 = addi %30, %c32 : index | |
| br ^bb10(%66 : index) | |
| ^bb26: // pred: ^bb10 | |
| %67 = addi %28, %c32 : index | |
| br ^bb9(%67 : index) | |
| ^bb27: // pred: ^bb9 | |
| %68 = addi %9, %8 : index | |
| br ^bb3(%68 : index) | |
| ^bb28: // pred: ^bb3 | |
| %69 = addi %5, %4 : index | |
| br ^bb1(%69 : index) | |
| ^bb29: // pred: ^bb1 | |
| return | |
| } | |
| hal.interface private @io { | |
| hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
| } | |
| } | |
| // -----// IR Dump After ConvertToLLVM //----- // | |
| module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} { | |
| llvm.func internal @matmul_test_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 { | |
| %0 = llvm.mlir.constant(0 : index) : i64 | |
| %1 = llvm.mlir.constant(10 : index) : i64 | |
| %2 = llvm.mlir.constant(0.000000e+00 : f32) : f32 | |
| %3 = llvm.mlir.constant(1.000000e+00 : f32) : f32 | |
| %4 = llvm.mlir.constant(1 : index) : i64 | |
| %5 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %6 = llvm.extractvalue %5[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %7 = llvm.mlir.constant(0 : i64) : i64 | |
| %8 = llvm.getelementptr %6[%7] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %9 = llvm.load %8 : !llvm.ptr<ptr<i8>> | |
| %10 = llvm.getelementptr %9[%0] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %11 = llvm.bitcast %10 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %12 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %13 = llvm.insertvalue %11, %12[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %14 = llvm.insertvalue %11, %13[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %15 = llvm.mlir.constant(0 : index) : i64 | |
| %16 = llvm.insertvalue %15, %14[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %17 = llvm.mlir.constant(10 : index) : i64 | |
| %18 = llvm.insertvalue %17, %16[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %19 = llvm.mlir.constant(10 : index) : i64 | |
| %20 = llvm.insertvalue %19, %18[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %21 = llvm.mlir.constant(10 : index) : i64 | |
| %22 = llvm.insertvalue %21, %20[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %23 = llvm.mlir.constant(1 : index) : i64 | |
| %24 = llvm.insertvalue %23, %22[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %25 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %26 = llvm.extractvalue %25[0] : !llvm.array<3 x i32> | |
| %27 = llvm.zext %26 : i32 to i64 | |
| %28 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %29 = llvm.extractvalue %28[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %30 = llvm.extractvalue %29[0] : !llvm.array<3 x i32> | |
| %31 = llvm.zext %30 : i32 to i64 | |
| %32 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %33 = llvm.extractvalue %32[1] : !llvm.array<3 x i32> | |
| %34 = llvm.zext %33 : i32 to i64 | |
| %35 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %36 = llvm.extractvalue %35[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %37 = llvm.extractvalue %36[1] : !llvm.array<3 x i32> | |
| %38 = llvm.zext %37 : i32 to i64 | |
| %39 = llvm.mlir.constant(64 : index) : i64 | |
| %40 = llvm.mul %34, %39 : i64 | |
| %41 = llvm.mlir.constant(64 : index) : i64 | |
| %42 = llvm.mul %38, %41 : i64 | |
| llvm.br ^bb1(%40 : i64) | |
| ^bb1(%43: i64): // 2 preds: ^bb0, ^bb10 | |
| %44 = llvm.icmp "slt" %43, %1 : i64 | |
| llvm.cond_br %44, ^bb2, ^bb11 | |
| ^bb2: // pred: ^bb1 | |
| %45 = llvm.mlir.constant(64 : index) : i64 | |
| %46 = llvm.mul %27, %45 : i64 | |
| %47 = llvm.mlir.constant(64 : index) : i64 | |
| %48 = llvm.mul %31, %47 : i64 | |
| llvm.br ^bb3(%46 : i64) | |
| ^bb3(%49: i64): // 2 preds: ^bb2, ^bb9 | |
| %50 = llvm.icmp "slt" %49, %1 : i64 | |
| llvm.cond_br %50, ^bb4, ^bb10 | |
| ^bb4: // pred: ^bb3 | |
| %51 = llvm.mlir.constant(64 : index) : i64 | |
| %52 = llvm.mlir.constant(-1 : index) : i64 | |
| %53 = llvm.mul %43, %52 : i64 | |
| %54 = llvm.mlir.constant(10 : index) : i64 | |
| %55 = llvm.add %53, %54 : i64 | |
| %56 = llvm.icmp "slt" %51, %55 : i64 | |
| %57 = llvm.select %56, %51, %55 : i1, i64 | |
| %58 = llvm.mlir.constant(64 : index) : i64 | |
| %59 = llvm.mlir.constant(-1 : index) : i64 | |
| %60 = llvm.mul %49, %59 : i64 | |
| %61 = llvm.mlir.constant(10 : index) : i64 | |
| %62 = llvm.add %60, %61 : i64 | |
| %63 = llvm.icmp "slt" %58, %62 : i64 | |
| %64 = llvm.select %63, %58, %62 : i1, i64 | |
| %65 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %66 = llvm.extractvalue %24[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %67 = llvm.bitcast %66 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %68 = llvm.insertvalue %67, %65[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %69 = llvm.extractvalue %24[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %70 = llvm.bitcast %69 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %71 = llvm.insertvalue %70, %68[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %72 = llvm.extractvalue %24[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %73 = llvm.extractvalue %24[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %74 = llvm.extractvalue %24[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %75 = llvm.mul %43, %72 : i64 | |
| %76 = llvm.add %74, %75 : i64 | |
| %77 = llvm.mul %49, %73 : i64 | |
| %78 = llvm.add %76, %77 : i64 | |
| %79 = llvm.insertvalue %78, %71[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %80 = llvm.mlir.constant(1 : i64) : i64 | |
| %81 = llvm.insertvalue %64, %79[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %82 = llvm.insertvalue %80, %81[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %83 = llvm.mlir.constant(10 : i64) : i64 | |
| %84 = llvm.insertvalue %57, %82[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %85 = llvm.insertvalue %83, %84[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| llvm.br ^bb5(%0 : i64) | |
| ^bb5(%86: i64): // 2 preds: ^bb4, ^bb8 | |
| %87 = llvm.icmp "slt" %86, %57 : i64 | |
| llvm.cond_br %87, ^bb6(%0 : i64), ^bb9 | |
| ^bb6(%88: i64): // 2 preds: ^bb5, ^bb7 | |
| %89 = llvm.icmp "slt" %88, %64 : i64 | |
| llvm.cond_br %89, ^bb7, ^bb8 | |
| ^bb7: // pred: ^bb6 | |
| %90 = llvm.add %86, %43 : i64 | |
| %91 = llvm.add %88, %49 : i64 | |
| %92 = llvm.icmp "eq" %90, %91 : i64 | |
| %93 = llvm.select %92, %2, %3 : i1, f32 | |
| %94 = llvm.extractvalue %85[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %95 = llvm.extractvalue %85[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %96 = llvm.mlir.constant(10 : index) : i64 | |
| %97 = llvm.mul %86, %96 : i64 | |
| %98 = llvm.add %95, %97 : i64 | |
| %99 = llvm.add %98, %88 : i64 | |
| %100 = llvm.getelementptr %94[%99] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| llvm.store %93, %100 : !llvm.ptr<f32> | |
| %101 = llvm.add %88, %4 : i64 | |
| llvm.br ^bb6(%101 : i64) | |
| ^bb8: // pred: ^bb6 | |
| %102 = llvm.add %86, %4 : i64 | |
| llvm.br ^bb5(%102 : i64) | |
| ^bb9: // pred: ^bb5 | |
| %103 = llvm.add %49, %48 : i64 | |
| llvm.br ^bb3(%103 : i64) | |
| ^bb10: // pred: ^bb3 | |
| %104 = llvm.add %43, %42 : i64 | |
| llvm.br ^bb1(%104 : i64) | |
| ^bb11: // pred: ^bb1 | |
| %105 = llvm.mlir.constant(0 : i32) : i32 | |
| llvm.return %105 : i32 | |
| } | |
| } | |
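ConvertToLLVM rewrites the whole dispatch into the LLVM dialect. Each memref becomes the standard descriptor struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>; index values become i64 per the llvm.data_layout; and the HAL ops are lowered against the ABI visible in the function signature: hal.interface.binding.subspan turns into a load from the ptr<ptr<i8>> binding table at field 5 of iree_hal_executable_dispatch_state_v0_t plus a bitcast to ptr<f32>, the workgroup id is read from the array<3 x i32> passed as %arg1, and the workgroup count from field 0 of the state struct. For orientation, the descriptor assembled by %12 through %24 above carries, field by field:

    // [0] allocated pointer  = %11
    // [1] aligned pointer    = %11
    // [2] offset             = 0
    // [3] sizes              = [10, 10]
    // [4] strides            = [10, 1]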
| // -----// IR Dump After LLVMCPUSynchronizeSymbolVisibility //----- // | |
| module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} { | |
| llvm.func internal @matmul_test_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 attributes {sym_visibility = "private"} { | |
| %0 = llvm.mlir.constant(0 : index) : i64 | |
| %1 = llvm.mlir.constant(10 : index) : i64 | |
| %2 = llvm.mlir.constant(0.000000e+00 : f32) : f32 | |
| %3 = llvm.mlir.constant(1.000000e+00 : f32) : f32 | |
| %4 = llvm.mlir.constant(1 : index) : i64 | |
| %5 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %6 = llvm.extractvalue %5[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %7 = llvm.mlir.constant(0 : i64) : i64 | |
| %8 = llvm.getelementptr %6[%7] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %9 = llvm.load %8 : !llvm.ptr<ptr<i8>> | |
| %10 = llvm.getelementptr %9[%0] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %11 = llvm.bitcast %10 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %12 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %13 = llvm.insertvalue %11, %12[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %14 = llvm.insertvalue %11, %13[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %15 = llvm.mlir.constant(0 : index) : i64 | |
| %16 = llvm.insertvalue %15, %14[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %17 = llvm.mlir.constant(10 : index) : i64 | |
| %18 = llvm.insertvalue %17, %16[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %19 = llvm.mlir.constant(10 : index) : i64 | |
| %20 = llvm.insertvalue %19, %18[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %21 = llvm.mlir.constant(10 : index) : i64 | |
| %22 = llvm.insertvalue %21, %20[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %23 = llvm.mlir.constant(1 : index) : i64 | |
| %24 = llvm.insertvalue %23, %22[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %25 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %26 = llvm.extractvalue %25[0] : !llvm.array<3 x i32> | |
| %27 = llvm.zext %26 : i32 to i64 | |
| %28 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %29 = llvm.extractvalue %28[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %30 = llvm.extractvalue %29[0] : !llvm.array<3 x i32> | |
| %31 = llvm.zext %30 : i32 to i64 | |
| %32 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %33 = llvm.extractvalue %32[1] : !llvm.array<3 x i32> | |
| %34 = llvm.zext %33 : i32 to i64 | |
| %35 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %36 = llvm.extractvalue %35[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %37 = llvm.extractvalue %36[1] : !llvm.array<3 x i32> | |
| %38 = llvm.zext %37 : i32 to i64 | |
| %39 = llvm.mlir.constant(64 : index) : i64 | |
| %40 = llvm.mul %34, %39 : i64 | |
| %41 = llvm.mlir.constant(64 : index) : i64 | |
| %42 = llvm.mul %38, %41 : i64 | |
| llvm.br ^bb1(%40 : i64) | |
| ^bb1(%43: i64): // 2 preds: ^bb0, ^bb10 | |
| %44 = llvm.icmp "slt" %43, %1 : i64 | |
| llvm.cond_br %44, ^bb2, ^bb11 | |
| ^bb2: // pred: ^bb1 | |
| %45 = llvm.mlir.constant(64 : index) : i64 | |
| %46 = llvm.mul %27, %45 : i64 | |
| %47 = llvm.mlir.constant(64 : index) : i64 | |
| %48 = llvm.mul %31, %47 : i64 | |
| llvm.br ^bb3(%46 : i64) | |
| ^bb3(%49: i64): // 2 preds: ^bb2, ^bb9 | |
| %50 = llvm.icmp "slt" %49, %1 : i64 | |
| llvm.cond_br %50, ^bb4, ^bb10 | |
| ^bb4: // pred: ^bb3 | |
| %51 = llvm.mlir.constant(64 : index) : i64 | |
| %52 = llvm.mlir.constant(-1 : index) : i64 | |
| %53 = llvm.mul %43, %52 : i64 | |
| %54 = llvm.mlir.constant(10 : index) : i64 | |
| %55 = llvm.add %53, %54 : i64 | |
| %56 = llvm.icmp "slt" %51, %55 : i64 | |
| %57 = llvm.select %56, %51, %55 : i1, i64 | |
| %58 = llvm.mlir.constant(64 : index) : i64 | |
| %59 = llvm.mlir.constant(-1 : index) : i64 | |
| %60 = llvm.mul %49, %59 : i64 | |
| %61 = llvm.mlir.constant(10 : index) : i64 | |
| %62 = llvm.add %60, %61 : i64 | |
| %63 = llvm.icmp "slt" %58, %62 : i64 | |
| %64 = llvm.select %63, %58, %62 : i1, i64 | |
| %65 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %66 = llvm.extractvalue %24[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %67 = llvm.bitcast %66 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %68 = llvm.insertvalue %67, %65[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %69 = llvm.extractvalue %24[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %70 = llvm.bitcast %69 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %71 = llvm.insertvalue %70, %68[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %72 = llvm.extractvalue %24[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %73 = llvm.extractvalue %24[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %74 = llvm.extractvalue %24[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %75 = llvm.mul %43, %72 : i64 | |
| %76 = llvm.add %74, %75 : i64 | |
| %77 = llvm.mul %49, %73 : i64 | |
| %78 = llvm.add %76, %77 : i64 | |
| %79 = llvm.insertvalue %78, %71[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %80 = llvm.mlir.constant(1 : i64) : i64 | |
| %81 = llvm.insertvalue %64, %79[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %82 = llvm.insertvalue %80, %81[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %83 = llvm.mlir.constant(10 : i64) : i64 | |
| %84 = llvm.insertvalue %57, %82[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %85 = llvm.insertvalue %83, %84[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| llvm.br ^bb5(%0 : i64) | |
| ^bb5(%86: i64): // 2 preds: ^bb4, ^bb8 | |
| %87 = llvm.icmp "slt" %86, %57 : i64 | |
| llvm.cond_br %87, ^bb6(%0 : i64), ^bb9 | |
| ^bb6(%88: i64): // 2 preds: ^bb5, ^bb7 | |
| %89 = llvm.icmp "slt" %88, %64 : i64 | |
| llvm.cond_br %89, ^bb7, ^bb8 | |
| ^bb7: // pred: ^bb6 | |
| %90 = llvm.add %86, %43 : i64 | |
| %91 = llvm.add %88, %49 : i64 | |
| %92 = llvm.icmp "eq" %90, %91 : i64 | |
| %93 = llvm.select %92, %2, %3 : i1, f32 | |
| %94 = llvm.extractvalue %85[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %95 = llvm.extractvalue %85[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %96 = llvm.mlir.constant(10 : index) : i64 | |
| %97 = llvm.mul %86, %96 : i64 | |
| %98 = llvm.add %95, %97 : i64 | |
| %99 = llvm.add %98, %88 : i64 | |
| %100 = llvm.getelementptr %94[%99] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| llvm.store %93, %100 : !llvm.ptr<f32> | |
| %101 = llvm.add %88, %4 : i64 | |
| llvm.br ^bb6(%101 : i64) | |
| ^bb8: // pred: ^bb6 | |
| %102 = llvm.add %86, %4 : i64 | |
| llvm.br ^bb5(%102 : i64) | |
| ^bb9: // pred: ^bb5 | |
| %103 = llvm.add %49, %48 : i64 | |
| llvm.br ^bb3(%103 : i64) | |
| ^bb10: // pred: ^bb3 | |
| %104 = llvm.add %43, %42 : i64 | |
| llvm.br ^bb1(%104 : i64) | |
| ^bb11: // pred: ^bb1 | |
| %105 = llvm.mlir.constant(0 : i32) : i32 | |
| llvm.return %105 : i32 | |
| } | |
| } | |
| // -----// IR Dump After ConvertToLLVM //----- // | |
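| // Note: the !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> values assembled throughout this dump are rank-2 memref descriptors. | |
| // A rough C-style sketch of that layout, assuming the standard MLIR memref-to-LLVM descriptor convention: | |
| //   struct MemRefDescriptor2D { | |
| //     float  *allocated;   // field [0]: allocated base pointer | |
| //     float  *aligned;     // field [1]: aligned pointer used for loads/stores | |
| //     int64_t offset;      // field [2]: element offset into the buffer | |
| //     int64_t sizes[2];    // fields [3, 0] and [3, 1] | |
| //     int64_t strides[2];  // fields [4, 0] and [4, 1] | |
| //   }; | |
| // Each llvm.insertvalue below populates one of these fields; subviews are formed by recomputing the offset and re-inserting the min'd sizes. | |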
| module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} { | |
| llvm.func internal @matmul_test_dispatch_2(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 { | |
| %0 = llvm.mlir.constant(4 : index) : i64 | |
| %1 = llvm.mlir.constant(32 : index) : i64 | |
| %2 = llvm.mlir.constant(0 : index) : i64 | |
| %3 = llvm.mlir.constant(10 : index) : i64 | |
| %4 = llvm.mlir.constant(0.000000e+00 : f32) : f32 | |
| %5 = llvm.mlir.constant(1.000000e+00 : f32) : f32 | |
| %6 = llvm.mlir.constant(1 : index) : i64 | |
| %7 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %8 = llvm.extractvalue %7[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %9 = llvm.mlir.constant(0 : i64) : i64 | |
| %10 = llvm.getelementptr %8[%9] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %11 = llvm.load %10 : !llvm.ptr<ptr<i8>> | |
| %12 = llvm.getelementptr %11[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %13 = llvm.bitcast %12 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %14 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %15 = llvm.insertvalue %13, %14[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %16 = llvm.insertvalue %13, %15[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %17 = llvm.mlir.constant(0 : index) : i64 | |
| %18 = llvm.insertvalue %17, %16[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %19 = llvm.mlir.constant(10 : index) : i64 | |
| %20 = llvm.insertvalue %19, %18[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %21 = llvm.mlir.constant(10 : index) : i64 | |
| %22 = llvm.insertvalue %21, %20[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %23 = llvm.mlir.constant(10 : index) : i64 | |
| %24 = llvm.insertvalue %23, %22[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %25 = llvm.mlir.constant(1 : index) : i64 | |
| %26 = llvm.insertvalue %25, %24[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %27 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %28 = llvm.extractvalue %27[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %29 = llvm.mlir.constant(1 : i64) : i64 | |
| %30 = llvm.getelementptr %28[%29] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %31 = llvm.load %30 : !llvm.ptr<ptr<i8>> | |
| %32 = llvm.getelementptr %31[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %33 = llvm.bitcast %32 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %34 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %35 = llvm.insertvalue %33, %34[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %36 = llvm.insertvalue %33, %35[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %37 = llvm.mlir.constant(0 : index) : i64 | |
| %38 = llvm.insertvalue %37, %36[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %39 = llvm.mlir.constant(10 : index) : i64 | |
| %40 = llvm.insertvalue %39, %38[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %41 = llvm.mlir.constant(10 : index) : i64 | |
| %42 = llvm.insertvalue %41, %40[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %43 = llvm.mlir.constant(10 : index) : i64 | |
| %44 = llvm.insertvalue %43, %42[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %45 = llvm.mlir.constant(1 : index) : i64 | |
| %46 = llvm.insertvalue %45, %44[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %47 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %48 = llvm.extractvalue %47[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %49 = llvm.mlir.constant(2 : i64) : i64 | |
| %50 = llvm.getelementptr %48[%49] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %51 = llvm.load %50 : !llvm.ptr<ptr<i8>> | |
| %52 = llvm.getelementptr %51[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %53 = llvm.bitcast %52 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %54 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %55 = llvm.insertvalue %53, %54[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %56 = llvm.insertvalue %53, %55[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %57 = llvm.mlir.constant(0 : index) : i64 | |
| %58 = llvm.insertvalue %57, %56[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %59 = llvm.mlir.constant(10 : index) : i64 | |
| %60 = llvm.insertvalue %59, %58[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %61 = llvm.mlir.constant(10 : index) : i64 | |
| %62 = llvm.insertvalue %61, %60[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %63 = llvm.mlir.constant(10 : index) : i64 | |
| %64 = llvm.insertvalue %63, %62[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %65 = llvm.mlir.constant(1 : index) : i64 | |
| %66 = llvm.insertvalue %65, %64[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %67 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %68 = llvm.extractvalue %67[0] : !llvm.array<3 x i32> | |
| %69 = llvm.zext %68 : i32 to i64 | |
| %70 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %71 = llvm.extractvalue %70[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %72 = llvm.extractvalue %71[0] : !llvm.array<3 x i32> | |
| %73 = llvm.zext %72 : i32 to i64 | |
| %74 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %75 = llvm.extractvalue %74[1] : !llvm.array<3 x i32> | |
| %76 = llvm.zext %75 : i32 to i64 | |
| %77 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %78 = llvm.extractvalue %77[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %79 = llvm.extractvalue %78[1] : !llvm.array<3 x i32> | |
| %80 = llvm.zext %79 : i32 to i64 | |
| %81 = llvm.mlir.constant(64 : index) : i64 | |
| %82 = llvm.mul %76, %81 : i64 | |
| %83 = llvm.mlir.constant(64 : index) : i64 | |
| %84 = llvm.mul %80, %83 : i64 | |
| llvm.br ^bb1(%82 : i64) | |
| ^bb1(%85: i64): // 2 preds: ^bb0, ^bb28 | |
| %86 = llvm.icmp "slt" %85, %3 : i64 | |
| llvm.cond_br %86, ^bb2, ^bb29 | |
| ^bb2: // pred: ^bb1 | |
| %87 = llvm.mlir.constant(64 : index) : i64 | |
| %88 = llvm.mul %69, %87 : i64 | |
| %89 = llvm.mlir.constant(64 : index) : i64 | |
| %90 = llvm.mul %73, %89 : i64 | |
| llvm.br ^bb3(%88 : i64) | |
| ^bb3(%91: i64): // 2 preds: ^bb2, ^bb27 | |
| %92 = llvm.icmp "slt" %91, %3 : i64 | |
| llvm.cond_br %92, ^bb4, ^bb28 | |
| ^bb4: // pred: ^bb3 | |
| %93 = llvm.mlir.constant(64 : index) : i64 | |
| %94 = llvm.mlir.constant(-1 : index) : i64 | |
| %95 = llvm.mul %85, %94 : i64 | |
| %96 = llvm.mlir.constant(10 : index) : i64 | |
| %97 = llvm.add %95, %96 : i64 | |
| %98 = llvm.icmp "slt" %93, %97 : i64 | |
| %99 = llvm.select %98, %93, %97 : i1, i64 | |
| %100 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %101 = llvm.extractvalue %26[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %102 = llvm.bitcast %101 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %103 = llvm.insertvalue %102, %100[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %104 = llvm.extractvalue %26[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %105 = llvm.bitcast %104 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %106 = llvm.insertvalue %105, %103[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %107 = llvm.extractvalue %26[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %108 = llvm.extractvalue %26[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %109 = llvm.extractvalue %26[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %110 = llvm.mul %85, %107 : i64 | |
| %111 = llvm.add %109, %110 : i64 | |
| %112 = llvm.mlir.constant(0 : i64) : i64 | |
| %113 = llvm.mul %112, %108 : i64 | |
| %114 = llvm.add %111, %113 : i64 | |
| %115 = llvm.insertvalue %114, %106[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %116 = llvm.mlir.constant(10 : i64) : i64 | |
| %117 = llvm.mlir.constant(1 : i64) : i64 | |
| %118 = llvm.insertvalue %116, %115[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %119 = llvm.insertvalue %117, %118[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %120 = llvm.mlir.constant(10 : i64) : i64 | |
| %121 = llvm.insertvalue %99, %119[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %122 = llvm.insertvalue %120, %121[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %123 = llvm.mlir.constant(64 : index) : i64 | |
| %124 = llvm.mlir.constant(-1 : index) : i64 | |
| %125 = llvm.mul %91, %124 : i64 | |
| %126 = llvm.mlir.constant(10 : index) : i64 | |
| %127 = llvm.add %125, %126 : i64 | |
| %128 = llvm.icmp "slt" %123, %127 : i64 | |
| %129 = llvm.select %128, %123, %127 : i1, i64 | |
| %130 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %131 = llvm.extractvalue %46[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %132 = llvm.bitcast %131 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %133 = llvm.insertvalue %132, %130[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %134 = llvm.extractvalue %46[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %135 = llvm.bitcast %134 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %136 = llvm.insertvalue %135, %133[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %137 = llvm.extractvalue %46[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %138 = llvm.extractvalue %46[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %139 = llvm.extractvalue %46[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %140 = llvm.mlir.constant(0 : i64) : i64 | |
| %141 = llvm.mul %140, %137 : i64 | |
| %142 = llvm.add %139, %141 : i64 | |
| %143 = llvm.mul %91, %138 : i64 | |
| %144 = llvm.add %142, %143 : i64 | |
| %145 = llvm.insertvalue %144, %136[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %146 = llvm.mlir.constant(1 : i64) : i64 | |
| %147 = llvm.insertvalue %129, %145[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %148 = llvm.insertvalue %146, %147[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %149 = llvm.mlir.constant(10 : i64) : i64 | |
| %150 = llvm.mlir.constant(10 : i64) : i64 | |
| %151 = llvm.insertvalue %149, %148[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %152 = llvm.insertvalue %150, %151[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %153 = llvm.mlir.constant(-1 : index) : i64 | |
| %154 = llvm.mul %85, %153 : i64 | |
| %155 = llvm.mlir.constant(10 : index) : i64 | |
| %156 = llvm.add %154, %155 : i64 | |
| %157 = llvm.mlir.constant(64 : index) : i64 | |
| %158 = llvm.icmp "slt" %156, %157 : i64 | |
| %159 = llvm.select %158, %156, %157 : i1, i64 | |
| %160 = llvm.mlir.constant(-1 : index) : i64 | |
| %161 = llvm.mul %91, %160 : i64 | |
| %162 = llvm.mlir.constant(10 : index) : i64 | |
| %163 = llvm.add %161, %162 : i64 | |
| %164 = llvm.mlir.constant(64 : index) : i64 | |
| %165 = llvm.icmp "slt" %163, %164 : i64 | |
| %166 = llvm.select %165, %163, %164 : i1, i64 | |
| %167 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %168 = llvm.extractvalue %66[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %169 = llvm.bitcast %168 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %170 = llvm.insertvalue %169, %167[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %171 = llvm.extractvalue %66[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %172 = llvm.bitcast %171 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %173 = llvm.insertvalue %172, %170[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %174 = llvm.extractvalue %66[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %175 = llvm.extractvalue %66[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %176 = llvm.extractvalue %66[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %177 = llvm.mul %85, %174 : i64 | |
| %178 = llvm.add %176, %177 : i64 | |
| %179 = llvm.mul %91, %175 : i64 | |
| %180 = llvm.add %178, %179 : i64 | |
| %181 = llvm.insertvalue %180, %173[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %182 = llvm.mlir.constant(1 : i64) : i64 | |
| %183 = llvm.insertvalue %129, %181[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %184 = llvm.insertvalue %182, %183[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %185 = llvm.mlir.constant(10 : i64) : i64 | |
| %186 = llvm.insertvalue %99, %184[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %187 = llvm.insertvalue %185, %186[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| llvm.br ^bb5(%2 : i64) | |
| ^bb5(%188: i64): // 2 preds: ^bb4, ^bb8 | |
| %189 = llvm.icmp "slt" %188, %159 : i64 | |
| llvm.cond_br %189, ^bb6(%2 : i64), ^bb9(%2 : i64) | |
| ^bb6(%190: i64): // 2 preds: ^bb5, ^bb7 | |
| %191 = llvm.icmp "slt" %190, %166 : i64 | |
| llvm.cond_br %191, ^bb7, ^bb8 | |
| ^bb7: // pred: ^bb6 | |
| %192 = llvm.add %188, %85 : i64 | |
| %193 = llvm.add %190, %91 : i64 | |
| %194 = llvm.icmp "eq" %192, %193 : i64 | |
| %195 = llvm.select %194, %4, %5 : i1, f32 | |
| %196 = llvm.extractvalue %187[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %197 = llvm.extractvalue %187[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %198 = llvm.mlir.constant(10 : index) : i64 | |
| %199 = llvm.mul %188, %198 : i64 | |
| %200 = llvm.add %197, %199 : i64 | |
| %201 = llvm.add %200, %190 : i64 | |
| %202 = llvm.getelementptr %196[%201] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| llvm.store %195, %202 : !llvm.ptr<f32> | |
| %203 = llvm.add %190, %6 : i64 | |
| llvm.br ^bb6(%203 : i64) | |
| ^bb8: // pred: ^bb6 | |
| %204 = llvm.add %188, %6 : i64 | |
| llvm.br ^bb5(%204 : i64) | |
| ^bb9(%205: i64): // 2 preds: ^bb5, ^bb26 | |
| %206 = llvm.icmp "slt" %205, %99 : i64 | |
| llvm.cond_br %206, ^bb10(%2 : i64), ^bb27 | |
| ^bb10(%207: i64): // 2 preds: ^bb9, ^bb25 | |
| %208 = llvm.icmp "slt" %207, %129 : i64 | |
| llvm.cond_br %208, ^bb11, ^bb26 | |
| ^bb11: // pred: ^bb10 | |
| %209 = llvm.mlir.constant(32 : index) : i64 | |
| %210 = llvm.mlir.constant(-1 : index) : i64 | |
| %211 = llvm.mul %205, %210 : i64 | |
| %212 = llvm.add %99, %211 : i64 | |
| %213 = llvm.icmp "slt" %209, %212 : i64 | |
| %214 = llvm.select %213, %209, %212 : i1, i64 | |
| %215 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %216 = llvm.extractvalue %122[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %217 = llvm.bitcast %216 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %218 = llvm.insertvalue %217, %215[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %219 = llvm.extractvalue %122[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %220 = llvm.bitcast %219 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %221 = llvm.insertvalue %220, %218[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %222 = llvm.extractvalue %122[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %223 = llvm.extractvalue %122[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %224 = llvm.extractvalue %122[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %225 = llvm.mul %205, %222 : i64 | |
| %226 = llvm.add %224, %225 : i64 | |
| %227 = llvm.mlir.constant(0 : i64) : i64 | |
| %228 = llvm.mul %227, %223 : i64 | |
| %229 = llvm.add %226, %228 : i64 | |
| %230 = llvm.insertvalue %229, %221[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %231 = llvm.mlir.constant(10 : i64) : i64 | |
| %232 = llvm.mlir.constant(1 : i64) : i64 | |
| %233 = llvm.insertvalue %231, %230[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %234 = llvm.insertvalue %232, %233[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %235 = llvm.mlir.constant(10 : i64) : i64 | |
| %236 = llvm.insertvalue %214, %234[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %237 = llvm.insertvalue %235, %236[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %238 = llvm.mlir.constant(32 : index) : i64 | |
| %239 = llvm.mlir.constant(-1 : index) : i64 | |
| %240 = llvm.mul %207, %239 : i64 | |
| %241 = llvm.add %129, %240 : i64 | |
| %242 = llvm.icmp "slt" %238, %241 : i64 | |
| %243 = llvm.select %242, %238, %241 : i1, i64 | |
| %244 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %245 = llvm.extractvalue %152[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %246 = llvm.bitcast %245 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %247 = llvm.insertvalue %246, %244[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %248 = llvm.extractvalue %152[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %249 = llvm.bitcast %248 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %250 = llvm.insertvalue %249, %247[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %251 = llvm.extractvalue %152[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %252 = llvm.extractvalue %152[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %253 = llvm.extractvalue %152[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %254 = llvm.mlir.constant(0 : i64) : i64 | |
| %255 = llvm.mul %254, %251 : i64 | |
| %256 = llvm.add %253, %255 : i64 | |
| %257 = llvm.mul %207, %252 : i64 | |
| %258 = llvm.add %256, %257 : i64 | |
| %259 = llvm.insertvalue %258, %250[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %260 = llvm.mlir.constant(1 : i64) : i64 | |
| %261 = llvm.insertvalue %243, %259[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %262 = llvm.insertvalue %260, %261[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %263 = llvm.mlir.constant(10 : i64) : i64 | |
| %264 = llvm.mlir.constant(10 : i64) : i64 | |
| %265 = llvm.insertvalue %263, %262[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %266 = llvm.insertvalue %264, %265[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %267 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %268 = llvm.extractvalue %187[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %269 = llvm.bitcast %268 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %270 = llvm.insertvalue %269, %267[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %271 = llvm.extractvalue %187[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %272 = llvm.bitcast %271 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %273 = llvm.insertvalue %272, %270[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %274 = llvm.extractvalue %187[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %275 = llvm.extractvalue %187[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %276 = llvm.extractvalue %187[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %277 = llvm.mul %205, %274 : i64 | |
| %278 = llvm.add %276, %277 : i64 | |
| %279 = llvm.mul %207, %275 : i64 | |
| %280 = llvm.add %278, %279 : i64 | |
| %281 = llvm.insertvalue %280, %273[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %282 = llvm.mlir.constant(1 : i64) : i64 | |
| %283 = llvm.insertvalue %243, %281[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %284 = llvm.insertvalue %282, %283[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %285 = llvm.mlir.constant(10 : i64) : i64 | |
| %286 = llvm.insertvalue %214, %284[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %287 = llvm.insertvalue %285, %286[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| llvm.br ^bb12(%2 : i64) | |
| ^bb12(%288: i64): // 2 preds: ^bb11, ^bb24 | |
| %289 = llvm.icmp "slt" %288, %214 : i64 | |
| llvm.cond_br %289, ^bb13(%2 : i64), ^bb25 | |
| ^bb13(%290: i64): // 2 preds: ^bb12, ^bb23 | |
| %291 = llvm.icmp "slt" %290, %243 : i64 | |
| llvm.cond_br %291, ^bb14(%2 : i64), ^bb24 | |
| ^bb14(%292: i64): // 2 preds: ^bb13, ^bb22 | |
| %293 = llvm.icmp "slt" %292, %3 : i64 | |
| llvm.cond_br %293, ^bb15, ^bb23 | |
| ^bb15: // pred: ^bb14 | |
| %294 = llvm.mlir.constant(4 : index) : i64 | |
| %295 = llvm.mlir.constant(-1 : index) : i64 | |
| %296 = llvm.mul %288, %295 : i64 | |
| %297 = llvm.add %214, %296 : i64 | |
| %298 = llvm.icmp "slt" %294, %297 : i64 | |
| %299 = llvm.select %298, %294, %297 : i1, i64 | |
| %300 = llvm.mlir.constant(4 : index) : i64 | |
| %301 = llvm.mlir.constant(-1 : index) : i64 | |
| %302 = llvm.mul %292, %301 : i64 | |
| %303 = llvm.mlir.constant(10 : index) : i64 | |
| %304 = llvm.add %302, %303 : i64 | |
| %305 = llvm.icmp "slt" %300, %304 : i64 | |
| %306 = llvm.select %305, %300, %304 : i1, i64 | |
| %307 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %308 = llvm.extractvalue %237[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %309 = llvm.bitcast %308 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %310 = llvm.insertvalue %309, %307[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %311 = llvm.extractvalue %237[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %312 = llvm.bitcast %311 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %313 = llvm.insertvalue %312, %310[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %314 = llvm.extractvalue %237[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %315 = llvm.extractvalue %237[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %316 = llvm.extractvalue %237[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %317 = llvm.mul %288, %314 : i64 | |
| %318 = llvm.add %316, %317 : i64 | |
| %319 = llvm.mul %292, %315 : i64 | |
| %320 = llvm.add %318, %319 : i64 | |
| %321 = llvm.insertvalue %320, %313[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %322 = llvm.mlir.constant(1 : i64) : i64 | |
| %323 = llvm.insertvalue %306, %321[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %324 = llvm.insertvalue %322, %323[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %325 = llvm.mlir.constant(10 : i64) : i64 | |
| %326 = llvm.insertvalue %299, %324[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %327 = llvm.insertvalue %325, %326[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %328 = llvm.mlir.constant(4 : index) : i64 | |
| %329 = llvm.mlir.constant(-1 : index) : i64 | |
| %330 = llvm.mul %290, %329 : i64 | |
| %331 = llvm.add %243, %330 : i64 | |
| %332 = llvm.icmp "slt" %328, %331 : i64 | |
| %333 = llvm.select %332, %328, %331 : i1, i64 | |
| %334 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %335 = llvm.extractvalue %266[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %336 = llvm.bitcast %335 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %337 = llvm.insertvalue %336, %334[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %338 = llvm.extractvalue %266[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %339 = llvm.bitcast %338 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %340 = llvm.insertvalue %339, %337[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %341 = llvm.extractvalue %266[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %342 = llvm.extractvalue %266[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %343 = llvm.extractvalue %266[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %344 = llvm.mul %292, %341 : i64 | |
| %345 = llvm.add %343, %344 : i64 | |
| %346 = llvm.mul %290, %342 : i64 | |
| %347 = llvm.add %345, %346 : i64 | |
| %348 = llvm.insertvalue %347, %340[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %349 = llvm.mlir.constant(1 : i64) : i64 | |
| %350 = llvm.insertvalue %333, %348[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %351 = llvm.insertvalue %349, %350[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %352 = llvm.mlir.constant(10 : i64) : i64 | |
| %353 = llvm.insertvalue %306, %351[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %354 = llvm.insertvalue %352, %353[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %355 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %356 = llvm.extractvalue %287[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %357 = llvm.bitcast %356 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %358 = llvm.insertvalue %357, %355[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %359 = llvm.extractvalue %287[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %360 = llvm.bitcast %359 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %361 = llvm.insertvalue %360, %358[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %362 = llvm.extractvalue %287[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %363 = llvm.extractvalue %287[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %364 = llvm.extractvalue %287[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %365 = llvm.mul %288, %362 : i64 | |
| %366 = llvm.add %364, %365 : i64 | |
| %367 = llvm.mul %290, %363 : i64 | |
| %368 = llvm.add %366, %367 : i64 | |
| %369 = llvm.insertvalue %368, %361[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %370 = llvm.mlir.constant(1 : i64) : i64 | |
| %371 = llvm.insertvalue %333, %369[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %372 = llvm.insertvalue %370, %371[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %373 = llvm.mlir.constant(10 : i64) : i64 | |
| %374 = llvm.insertvalue %299, %372[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %375 = llvm.insertvalue %373, %374[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| llvm.br ^bb16(%2 : i64) | |
| ^bb16(%376: i64): // 2 preds: ^bb15, ^bb21 | |
| %377 = llvm.icmp "slt" %376, %299 : i64 | |
| llvm.cond_br %377, ^bb17(%2 : i64), ^bb22 | |
| ^bb17(%378: i64): // 2 preds: ^bb16, ^bb20 | |
| %379 = llvm.icmp "slt" %378, %333 : i64 | |
| llvm.cond_br %379, ^bb18(%2 : i64), ^bb21 | |
| ^bb18(%380: i64): // 2 preds: ^bb17, ^bb19 | |
| %381 = llvm.icmp "slt" %380, %306 : i64 | |
| llvm.cond_br %381, ^bb19, ^bb20 | |
| ^bb19: // pred: ^bb18 | |
| %382 = llvm.extractvalue %327[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %383 = llvm.extractvalue %327[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %384 = llvm.mlir.constant(10 : index) : i64 | |
| %385 = llvm.mul %376, %384 : i64 | |
| %386 = llvm.add %383, %385 : i64 | |
| %387 = llvm.add %386, %380 : i64 | |
| %388 = llvm.getelementptr %382[%387] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| %389 = llvm.load %388 : !llvm.ptr<f32> | |
| %390 = llvm.extractvalue %354[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %391 = llvm.extractvalue %354[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %392 = llvm.mlir.constant(10 : index) : i64 | |
| %393 = llvm.mul %380, %392 : i64 | |
| %394 = llvm.add %391, %393 : i64 | |
| %395 = llvm.add %394, %378 : i64 | |
| %396 = llvm.getelementptr %390[%395] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| %397 = llvm.load %396 : !llvm.ptr<f32> | |
| %398 = llvm.extractvalue %375[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %399 = llvm.extractvalue %375[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %400 = llvm.mlir.constant(10 : index) : i64 | |
| %401 = llvm.mul %376, %400 : i64 | |
| %402 = llvm.add %399, %401 : i64 | |
| %403 = llvm.add %402, %378 : i64 | |
| %404 = llvm.getelementptr %398[%403] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| %405 = llvm.load %404 : !llvm.ptr<f32> | |
| %406 = llvm.fmul %389, %397 : f32 | |
| %407 = llvm.fadd %405, %406 : f32 | |
| %408 = llvm.extractvalue %375[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %409 = llvm.extractvalue %375[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %410 = llvm.mlir.constant(10 : index) : i64 | |
| %411 = llvm.mul %376, %410 : i64 | |
| %412 = llvm.add %409, %411 : i64 | |
| %413 = llvm.add %412, %378 : i64 | |
| %414 = llvm.getelementptr %408[%413] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| llvm.store %407, %414 : !llvm.ptr<f32> | |
| %415 = llvm.add %380, %6 : i64 | |
| llvm.br ^bb18(%415 : i64) | |
| ^bb20: // pred: ^bb18 | |
| %416 = llvm.add %378, %6 : i64 | |
| llvm.br ^bb17(%416 : i64) | |
| ^bb21: // pred: ^bb17 | |
| %417 = llvm.add %376, %6 : i64 | |
| llvm.br ^bb16(%417 : i64) | |
| ^bb22: // pred: ^bb16 | |
| %418 = llvm.add %292, %0 : i64 | |
| llvm.br ^bb14(%418 : i64) | |
| ^bb23: // pred: ^bb14 | |
| %419 = llvm.add %290, %0 : i64 | |
| llvm.br ^bb13(%419 : i64) | |
| ^bb24: // pred: ^bb13 | |
| %420 = llvm.add %288, %0 : i64 | |
| llvm.br ^bb12(%420 : i64) | |
| ^bb25: // pred: ^bb12 | |
| %421 = llvm.add %207, %1 : i64 | |
| llvm.br ^bb10(%421 : i64) | |
| ^bb26: // pred: ^bb10 | |
| %422 = llvm.add %205, %1 : i64 | |
| llvm.br ^bb9(%422 : i64) | |
| ^bb27: // pred: ^bb9 | |
| %423 = llvm.add %91, %90 : i64 | |
| llvm.br ^bb3(%423 : i64) | |
| ^bb28: // pred: ^bb3 | |
| %424 = llvm.add %85, %84 : i64 | |
| llvm.br ^bb1(%424 : i64) | |
| ^bb29: // pred: ^bb1 | |
| %425 = llvm.mlir.constant(0 : i32) : i32 | |
| llvm.return %425 : i32 | |
| } | |
| } | |
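| // Note on the matmul dispatch above: the branch structure is a tiled loop nest, read off the constants in the dump rather than stated by the compiler. | |
| // ^bb1/^bb3 step over 64x64 workgroup tiles (workgroup id/count scaled by 64), ^bb5-^bb8 fill the output tile with the select-based 0.0/1.0 diagonal | |
| // pattern from the source, ^bb9/^bb10 carve 32x32 sub-tiles, ^bb12-^bb14 carve 4x4 tiles with the k dimension (bound 10) also stepped by 4, and | |
| // ^bb16-^bb19 are the scalar fmul/fadd accumulation loops over each small tile. | |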
| // -----// IR Dump After Canonicalizer //----- // | |
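| // Note: in the fill dispatch below, the memref-descriptor insertvalue/extractvalue chains and duplicated constants have been folded away, which appears | |
| // to be the canonicalizer's visible effect here; the loop body now addresses the buffer directly through the bitcast base pointer (%11) with the row | |
| // stride of 10 and the unit column stride inlined into the index arithmetic. | |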
| module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} { | |
| llvm.func internal @matmul_test_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 attributes {sym_visibility = "private"} { | |
| %0 = llvm.mlir.constant(0 : index) : i64 | |
| %1 = llvm.mlir.constant(10 : index) : i64 | |
| %2 = llvm.mlir.constant(0.000000e+00 : f32) : f32 | |
| %3 = llvm.mlir.constant(1.000000e+00 : f32) : f32 | |
| %4 = llvm.mlir.constant(1 : index) : i64 | |
| %5 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %6 = llvm.extractvalue %5[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %7 = llvm.mlir.constant(0 : i64) : i64 | |
| %8 = llvm.getelementptr %6[%7] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %9 = llvm.load %8 : !llvm.ptr<ptr<i8>> | |
| %10 = llvm.getelementptr %9[%0] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %11 = llvm.bitcast %10 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %12 = llvm.mlir.constant(0 : index) : i64 | |
| %13 = llvm.mlir.constant(10 : index) : i64 | |
| %14 = llvm.mlir.constant(1 : index) : i64 | |
| %15 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %16 = llvm.extractvalue %15[0] : !llvm.array<3 x i32> | |
| %17 = llvm.zext %16 : i32 to i64 | |
| %18 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %19 = llvm.extractvalue %18[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %20 = llvm.extractvalue %19[0] : !llvm.array<3 x i32> | |
| %21 = llvm.zext %20 : i32 to i64 | |
| %22 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %23 = llvm.extractvalue %22[1] : !llvm.array<3 x i32> | |
| %24 = llvm.zext %23 : i32 to i64 | |
| %25 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %26 = llvm.extractvalue %25[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %27 = llvm.extractvalue %26[1] : !llvm.array<3 x i32> | |
| %28 = llvm.zext %27 : i32 to i64 | |
| %29 = llvm.mlir.constant(64 : index) : i64 | |
| %30 = llvm.mul %24, %29 : i64 | |
| %31 = llvm.mlir.constant(64 : index) : i64 | |
| %32 = llvm.mul %28, %31 : i64 | |
| llvm.br ^bb1(%30 : i64) | |
| ^bb1(%33: i64): // 2 preds: ^bb0, ^bb10 | |
| %34 = llvm.icmp "slt" %33, %1 : i64 | |
| llvm.cond_br %34, ^bb2, ^bb11 | |
| ^bb2: // pred: ^bb1 | |
| %35 = llvm.mlir.constant(64 : index) : i64 | |
| %36 = llvm.mul %17, %35 : i64 | |
| %37 = llvm.mlir.constant(64 : index) : i64 | |
| %38 = llvm.mul %21, %37 : i64 | |
| llvm.br ^bb3(%36 : i64) | |
| ^bb3(%39: i64): // 2 preds: ^bb2, ^bb9 | |
| %40 = llvm.icmp "slt" %39, %1 : i64 | |
| llvm.cond_br %40, ^bb4, ^bb10 | |
| ^bb4: // pred: ^bb3 | |
| %41 = llvm.mlir.constant(64 : index) : i64 | |
| %42 = llvm.mlir.constant(-1 : index) : i64 | |
| %43 = llvm.mul %33, %42 : i64 | |
| %44 = llvm.mlir.constant(10 : index) : i64 | |
| %45 = llvm.add %43, %44 : i64 | |
| %46 = llvm.icmp "slt" %41, %45 : i64 | |
| %47 = llvm.select %46, %41, %45 : i1, i64 | |
| %48 = llvm.mlir.constant(64 : index) : i64 | |
| %49 = llvm.mlir.constant(-1 : index) : i64 | |
| %50 = llvm.mul %39, %49 : i64 | |
| %51 = llvm.mlir.constant(10 : index) : i64 | |
| %52 = llvm.add %50, %51 : i64 | |
| %53 = llvm.icmp "slt" %48, %52 : i64 | |
| %54 = llvm.select %53, %48, %52 : i1, i64 | |
| %55 = llvm.bitcast %11 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %56 = llvm.mul %33, %13 : i64 | |
| %57 = llvm.add %12, %56 : i64 | |
| %58 = llvm.mul %39, %14 : i64 | |
| %59 = llvm.add %57, %58 : i64 | |
| llvm.br ^bb5(%0 : i64) | |
| ^bb5(%60: i64): // 2 preds: ^bb4, ^bb8 | |
| %61 = llvm.icmp "slt" %60, %47 : i64 | |
| llvm.cond_br %61, ^bb6(%0 : i64), ^bb9 | |
| ^bb6(%62: i64): // 2 preds: ^bb5, ^bb7 | |
| %63 = llvm.icmp "slt" %62, %54 : i64 | |
| llvm.cond_br %63, ^bb7, ^bb8 | |
| ^bb7: // pred: ^bb6 | |
| %64 = llvm.add %60, %33 : i64 | |
| %65 = llvm.add %62, %39 : i64 | |
| %66 = llvm.icmp "eq" %64, %65 : i64 | |
| %67 = llvm.select %66, %2, %3 : i1, f32 | |
| %68 = llvm.mlir.constant(10 : index) : i64 | |
| %69 = llvm.mul %60, %68 : i64 | |
| %70 = llvm.add %59, %69 : i64 | |
| %71 = llvm.add %70, %62 : i64 | |
| %72 = llvm.getelementptr %55[%71] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| llvm.store %67, %72 : !llvm.ptr<f32> | |
| %73 = llvm.add %62, %4 : i64 | |
| llvm.br ^bb6(%73 : i64) | |
| ^bb8: // pred: ^bb6 | |
| %74 = llvm.add %60, %4 : i64 | |
| llvm.br ^bb5(%74 : i64) | |
| ^bb9: // pred: ^bb5 | |
| %75 = llvm.add %39, %38 : i64 | |
| llvm.br ^bb3(%75 : i64) | |
| ^bb10: // pred: ^bb3 | |
| %76 = llvm.add %33, %32 : i64 | |
| llvm.br ^bb1(%76 : i64) | |
| ^bb11: // pred: ^bb1 | |
| %77 = llvm.mlir.constant(0 : i32) : i32 | |
| llvm.return %77 : i32 | |
| } | |
| } | |
| // -----// IR Dump After LLVMCPUSynchronizeSymbolVisibility //----- // | |
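| // Note: relative to the ConvertToLLVM dump of this same dispatch function above, the visible difference here is the added sym_visibility = "private" | |
| // attribute on the llvm.func; the function body appears otherwise unchanged in this dump. | |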
| module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} { | |
| llvm.func internal @matmul_test_dispatch_2(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 attributes {sym_visibility = "private"} { | |
| %0 = llvm.mlir.constant(4 : index) : i64 | |
| %1 = llvm.mlir.constant(32 : index) : i64 | |
| %2 = llvm.mlir.constant(0 : index) : i64 | |
| %3 = llvm.mlir.constant(10 : index) : i64 | |
| %4 = llvm.mlir.constant(0.000000e+00 : f32) : f32 | |
| %5 = llvm.mlir.constant(1.000000e+00 : f32) : f32 | |
| %6 = llvm.mlir.constant(1 : index) : i64 | |
| %7 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %8 = llvm.extractvalue %7[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %9 = llvm.mlir.constant(0 : i64) : i64 | |
| %10 = llvm.getelementptr %8[%9] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %11 = llvm.load %10 : !llvm.ptr<ptr<i8>> | |
| %12 = llvm.getelementptr %11[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %13 = llvm.bitcast %12 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %14 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %15 = llvm.insertvalue %13, %14[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %16 = llvm.insertvalue %13, %15[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %17 = llvm.mlir.constant(0 : index) : i64 | |
| %18 = llvm.insertvalue %17, %16[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %19 = llvm.mlir.constant(10 : index) : i64 | |
| %20 = llvm.insertvalue %19, %18[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %21 = llvm.mlir.constant(10 : index) : i64 | |
| %22 = llvm.insertvalue %21, %20[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %23 = llvm.mlir.constant(10 : index) : i64 | |
| %24 = llvm.insertvalue %23, %22[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %25 = llvm.mlir.constant(1 : index) : i64 | |
| %26 = llvm.insertvalue %25, %24[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %27 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %28 = llvm.extractvalue %27[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %29 = llvm.mlir.constant(1 : i64) : i64 | |
| %30 = llvm.getelementptr %28[%29] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %31 = llvm.load %30 : !llvm.ptr<ptr<i8>> | |
| %32 = llvm.getelementptr %31[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %33 = llvm.bitcast %32 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %34 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %35 = llvm.insertvalue %33, %34[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %36 = llvm.insertvalue %33, %35[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %37 = llvm.mlir.constant(0 : index) : i64 | |
| %38 = llvm.insertvalue %37, %36[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %39 = llvm.mlir.constant(10 : index) : i64 | |
| %40 = llvm.insertvalue %39, %38[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %41 = llvm.mlir.constant(10 : index) : i64 | |
| %42 = llvm.insertvalue %41, %40[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %43 = llvm.mlir.constant(10 : index) : i64 | |
| %44 = llvm.insertvalue %43, %42[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %45 = llvm.mlir.constant(1 : index) : i64 | |
| %46 = llvm.insertvalue %45, %44[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %47 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %48 = llvm.extractvalue %47[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %49 = llvm.mlir.constant(2 : i64) : i64 | |
| %50 = llvm.getelementptr %48[%49] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %51 = llvm.load %50 : !llvm.ptr<ptr<i8>> | |
| %52 = llvm.getelementptr %51[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %53 = llvm.bitcast %52 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %54 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %55 = llvm.insertvalue %53, %54[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %56 = llvm.insertvalue %53, %55[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %57 = llvm.mlir.constant(0 : index) : i64 | |
| %58 = llvm.insertvalue %57, %56[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %59 = llvm.mlir.constant(10 : index) : i64 | |
| %60 = llvm.insertvalue %59, %58[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %61 = llvm.mlir.constant(10 : index) : i64 | |
| %62 = llvm.insertvalue %61, %60[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %63 = llvm.mlir.constant(10 : index) : i64 | |
| %64 = llvm.insertvalue %63, %62[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %65 = llvm.mlir.constant(1 : index) : i64 | |
| %66 = llvm.insertvalue %65, %64[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %67 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %68 = llvm.extractvalue %67[0] : !llvm.array<3 x i32> | |
| %69 = llvm.zext %68 : i32 to i64 | |
| %70 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %71 = llvm.extractvalue %70[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %72 = llvm.extractvalue %71[0] : !llvm.array<3 x i32> | |
| %73 = llvm.zext %72 : i32 to i64 | |
| %74 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %75 = llvm.extractvalue %74[1] : !llvm.array<3 x i32> | |
| %76 = llvm.zext %75 : i32 to i64 | |
| %77 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %78 = llvm.extractvalue %77[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %79 = llvm.extractvalue %78[1] : !llvm.array<3 x i32> | |
| %80 = llvm.zext %79 : i32 to i64 | |
| %81 = llvm.mlir.constant(64 : index) : i64 | |
| %82 = llvm.mul %76, %81 : i64 | |
| %83 = llvm.mlir.constant(64 : index) : i64 | |
| %84 = llvm.mul %80, %83 : i64 | |
| llvm.br ^bb1(%82 : i64) | |
| ^bb1(%85: i64): // 2 preds: ^bb0, ^bb28 | |
| %86 = llvm.icmp "slt" %85, %3 : i64 | |
| llvm.cond_br %86, ^bb2, ^bb29 | |
| ^bb2: // pred: ^bb1 | |
| %87 = llvm.mlir.constant(64 : index) : i64 | |
| %88 = llvm.mul %69, %87 : i64 | |
| %89 = llvm.mlir.constant(64 : index) : i64 | |
| %90 = llvm.mul %73, %89 : i64 | |
| llvm.br ^bb3(%88 : i64) | |
| ^bb3(%91: i64): // 2 preds: ^bb2, ^bb27 | |
| %92 = llvm.icmp "slt" %91, %3 : i64 | |
| llvm.cond_br %92, ^bb4, ^bb28 | |
| ^bb4: // pred: ^bb3 | |
| %93 = llvm.mlir.constant(64 : index) : i64 | |
| %94 = llvm.mlir.constant(-1 : index) : i64 | |
| %95 = llvm.mul %85, %94 : i64 | |
| %96 = llvm.mlir.constant(10 : index) : i64 | |
| %97 = llvm.add %95, %96 : i64 | |
| %98 = llvm.icmp "slt" %93, %97 : i64 | |
| %99 = llvm.select %98, %93, %97 : i1, i64 | |
| %100 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %101 = llvm.extractvalue %26[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %102 = llvm.bitcast %101 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %103 = llvm.insertvalue %102, %100[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %104 = llvm.extractvalue %26[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %105 = llvm.bitcast %104 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %106 = llvm.insertvalue %105, %103[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %107 = llvm.extractvalue %26[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %108 = llvm.extractvalue %26[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %109 = llvm.extractvalue %26[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %110 = llvm.mul %85, %107 : i64 | |
| %111 = llvm.add %109, %110 : i64 | |
| %112 = llvm.mlir.constant(0 : i64) : i64 | |
| %113 = llvm.mul %112, %108 : i64 | |
| %114 = llvm.add %111, %113 : i64 | |
| %115 = llvm.insertvalue %114, %106[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %116 = llvm.mlir.constant(10 : i64) : i64 | |
| %117 = llvm.mlir.constant(1 : i64) : i64 | |
| %118 = llvm.insertvalue %116, %115[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %119 = llvm.insertvalue %117, %118[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %120 = llvm.mlir.constant(10 : i64) : i64 | |
| %121 = llvm.insertvalue %99, %119[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %122 = llvm.insertvalue %120, %121[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %123 = llvm.mlir.constant(64 : index) : i64 | |
| %124 = llvm.mlir.constant(-1 : index) : i64 | |
| %125 = llvm.mul %91, %124 : i64 | |
| %126 = llvm.mlir.constant(10 : index) : i64 | |
| %127 = llvm.add %125, %126 : i64 | |
| %128 = llvm.icmp "slt" %123, %127 : i64 | |
| %129 = llvm.select %128, %123, %127 : i1, i64 | |
| %130 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %131 = llvm.extractvalue %46[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %132 = llvm.bitcast %131 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %133 = llvm.insertvalue %132, %130[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %134 = llvm.extractvalue %46[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %135 = llvm.bitcast %134 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %136 = llvm.insertvalue %135, %133[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %137 = llvm.extractvalue %46[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %138 = llvm.extractvalue %46[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %139 = llvm.extractvalue %46[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %140 = llvm.mlir.constant(0 : i64) : i64 | |
| %141 = llvm.mul %140, %137 : i64 | |
| %142 = llvm.add %139, %141 : i64 | |
| %143 = llvm.mul %91, %138 : i64 | |
| %144 = llvm.add %142, %143 : i64 | |
| %145 = llvm.insertvalue %144, %136[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %146 = llvm.mlir.constant(1 : i64) : i64 | |
| %147 = llvm.insertvalue %129, %145[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %148 = llvm.insertvalue %146, %147[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %149 = llvm.mlir.constant(10 : i64) : i64 | |
| %150 = llvm.mlir.constant(10 : i64) : i64 | |
| %151 = llvm.insertvalue %149, %148[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %152 = llvm.insertvalue %150, %151[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %153 = llvm.mlir.constant(-1 : index) : i64 | |
| %154 = llvm.mul %85, %153 : i64 | |
| %155 = llvm.mlir.constant(10 : index) : i64 | |
| %156 = llvm.add %154, %155 : i64 | |
| %157 = llvm.mlir.constant(64 : index) : i64 | |
| %158 = llvm.icmp "slt" %156, %157 : i64 | |
| %159 = llvm.select %158, %156, %157 : i1, i64 | |
| %160 = llvm.mlir.constant(-1 : index) : i64 | |
| %161 = llvm.mul %91, %160 : i64 | |
| %162 = llvm.mlir.constant(10 : index) : i64 | |
| %163 = llvm.add %161, %162 : i64 | |
| %164 = llvm.mlir.constant(64 : index) : i64 | |
| %165 = llvm.icmp "slt" %163, %164 : i64 | |
| %166 = llvm.select %165, %163, %164 : i1, i64 | |
| %167 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %168 = llvm.extractvalue %66[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %169 = llvm.bitcast %168 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %170 = llvm.insertvalue %169, %167[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %171 = llvm.extractvalue %66[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %172 = llvm.bitcast %171 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %173 = llvm.insertvalue %172, %170[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %174 = llvm.extractvalue %66[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %175 = llvm.extractvalue %66[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %176 = llvm.extractvalue %66[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %177 = llvm.mul %85, %174 : i64 | |
| %178 = llvm.add %176, %177 : i64 | |
| %179 = llvm.mul %91, %175 : i64 | |
| %180 = llvm.add %178, %179 : i64 | |
| %181 = llvm.insertvalue %180, %173[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %182 = llvm.mlir.constant(1 : i64) : i64 | |
| %183 = llvm.insertvalue %129, %181[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %184 = llvm.insertvalue %182, %183[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %185 = llvm.mlir.constant(10 : i64) : i64 | |
| %186 = llvm.insertvalue %99, %184[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %187 = llvm.insertvalue %185, %186[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| llvm.br ^bb5(%2 : i64) | |
| ^bb5(%188: i64): // 2 preds: ^bb4, ^bb8 | |
| %189 = llvm.icmp "slt" %188, %159 : i64 | |
| llvm.cond_br %189, ^bb6(%2 : i64), ^bb9(%2 : i64) | |
| ^bb6(%190: i64): // 2 preds: ^bb5, ^bb7 | |
| %191 = llvm.icmp "slt" %190, %166 : i64 | |
| llvm.cond_br %191, ^bb7, ^bb8 | |
| ^bb7: // pred: ^bb6 | |
| %192 = llvm.add %188, %85 : i64 | |
| %193 = llvm.add %190, %91 : i64 | |
| %194 = llvm.icmp "eq" %192, %193 : i64 | |
| %195 = llvm.select %194, %4, %5 : i1, f32 | |
| %196 = llvm.extractvalue %187[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %197 = llvm.extractvalue %187[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %198 = llvm.mlir.constant(10 : index) : i64 | |
| %199 = llvm.mul %188, %198 : i64 | |
| %200 = llvm.add %197, %199 : i64 | |
| %201 = llvm.add %200, %190 : i64 | |
| %202 = llvm.getelementptr %196[%201] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| llvm.store %195, %202 : !llvm.ptr<f32> | |
| %203 = llvm.add %190, %6 : i64 | |
| llvm.br ^bb6(%203 : i64) | |
| ^bb8: // pred: ^bb6 | |
| %204 = llvm.add %188, %6 : i64 | |
| llvm.br ^bb5(%204 : i64) | |
| ^bb9(%205: i64): // 2 preds: ^bb5, ^bb26 | |
| %206 = llvm.icmp "slt" %205, %99 : i64 | |
| llvm.cond_br %206, ^bb10(%2 : i64), ^bb27 | |
| ^bb10(%207: i64): // 2 preds: ^bb9, ^bb25 | |
| %208 = llvm.icmp "slt" %207, %129 : i64 | |
| llvm.cond_br %208, ^bb11, ^bb26 | |
| ^bb11: // pred: ^bb10 | |
| %209 = llvm.mlir.constant(32 : index) : i64 | |
| %210 = llvm.mlir.constant(-1 : index) : i64 | |
| %211 = llvm.mul %205, %210 : i64 | |
| %212 = llvm.add %99, %211 : i64 | |
| %213 = llvm.icmp "slt" %209, %212 : i64 | |
| %214 = llvm.select %213, %209, %212 : i1, i64 | |
| %215 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %216 = llvm.extractvalue %122[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %217 = llvm.bitcast %216 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %218 = llvm.insertvalue %217, %215[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %219 = llvm.extractvalue %122[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %220 = llvm.bitcast %219 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %221 = llvm.insertvalue %220, %218[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %222 = llvm.extractvalue %122[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %223 = llvm.extractvalue %122[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %224 = llvm.extractvalue %122[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %225 = llvm.mul %205, %222 : i64 | |
| %226 = llvm.add %224, %225 : i64 | |
| %227 = llvm.mlir.constant(0 : i64) : i64 | |
| %228 = llvm.mul %227, %223 : i64 | |
| %229 = llvm.add %226, %228 : i64 | |
| %230 = llvm.insertvalue %229, %221[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %231 = llvm.mlir.constant(10 : i64) : i64 | |
| %232 = llvm.mlir.constant(1 : i64) : i64 | |
| %233 = llvm.insertvalue %231, %230[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %234 = llvm.insertvalue %232, %233[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %235 = llvm.mlir.constant(10 : i64) : i64 | |
| %236 = llvm.insertvalue %214, %234[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %237 = llvm.insertvalue %235, %236[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %238 = llvm.mlir.constant(32 : index) : i64 | |
| %239 = llvm.mlir.constant(-1 : index) : i64 | |
| %240 = llvm.mul %207, %239 : i64 | |
| %241 = llvm.add %129, %240 : i64 | |
| %242 = llvm.icmp "slt" %238, %241 : i64 | |
| %243 = llvm.select %242, %238, %241 : i1, i64 | |
| %244 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %245 = llvm.extractvalue %152[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %246 = llvm.bitcast %245 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %247 = llvm.insertvalue %246, %244[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %248 = llvm.extractvalue %152[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %249 = llvm.bitcast %248 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %250 = llvm.insertvalue %249, %247[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %251 = llvm.extractvalue %152[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %252 = llvm.extractvalue %152[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %253 = llvm.extractvalue %152[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %254 = llvm.mlir.constant(0 : i64) : i64 | |
| %255 = llvm.mul %254, %251 : i64 | |
| %256 = llvm.add %253, %255 : i64 | |
| %257 = llvm.mul %207, %252 : i64 | |
| %258 = llvm.add %256, %257 : i64 | |
| %259 = llvm.insertvalue %258, %250[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %260 = llvm.mlir.constant(1 : i64) : i64 | |
| %261 = llvm.insertvalue %243, %259[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %262 = llvm.insertvalue %260, %261[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %263 = llvm.mlir.constant(10 : i64) : i64 | |
| %264 = llvm.mlir.constant(10 : i64) : i64 | |
| %265 = llvm.insertvalue %263, %262[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %266 = llvm.insertvalue %264, %265[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %267 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %268 = llvm.extractvalue %187[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %269 = llvm.bitcast %268 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %270 = llvm.insertvalue %269, %267[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %271 = llvm.extractvalue %187[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %272 = llvm.bitcast %271 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %273 = llvm.insertvalue %272, %270[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %274 = llvm.extractvalue %187[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %275 = llvm.extractvalue %187[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %276 = llvm.extractvalue %187[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %277 = llvm.mul %205, %274 : i64 | |
| %278 = llvm.add %276, %277 : i64 | |
| %279 = llvm.mul %207, %275 : i64 | |
| %280 = llvm.add %278, %279 : i64 | |
| %281 = llvm.insertvalue %280, %273[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %282 = llvm.mlir.constant(1 : i64) : i64 | |
| %283 = llvm.insertvalue %243, %281[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %284 = llvm.insertvalue %282, %283[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %285 = llvm.mlir.constant(10 : i64) : i64 | |
| %286 = llvm.insertvalue %214, %284[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %287 = llvm.insertvalue %285, %286[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| llvm.br ^bb12(%2 : i64) | |
| ^bb12(%288: i64): // 2 preds: ^bb11, ^bb24 | |
| %289 = llvm.icmp "slt" %288, %214 : i64 | |
| llvm.cond_br %289, ^bb13(%2 : i64), ^bb25 | |
| ^bb13(%290: i64): // 2 preds: ^bb12, ^bb23 | |
| %291 = llvm.icmp "slt" %290, %243 : i64 | |
| llvm.cond_br %291, ^bb14(%2 : i64), ^bb24 | |
| ^bb14(%292: i64): // 2 preds: ^bb13, ^bb22 | |
| %293 = llvm.icmp "slt" %292, %3 : i64 | |
| llvm.cond_br %293, ^bb15, ^bb23 | |
| ^bb15: // pred: ^bb14 | |
| %294 = llvm.mlir.constant(4 : index) : i64 | |
| %295 = llvm.mlir.constant(-1 : index) : i64 | |
| %296 = llvm.mul %288, %295 : i64 | |
| %297 = llvm.add %214, %296 : i64 | |
| %298 = llvm.icmp "slt" %294, %297 : i64 | |
| %299 = llvm.select %298, %294, %297 : i1, i64 | |
| %300 = llvm.mlir.constant(4 : index) : i64 | |
| %301 = llvm.mlir.constant(-1 : index) : i64 | |
| %302 = llvm.mul %292, %301 : i64 | |
| %303 = llvm.mlir.constant(10 : index) : i64 | |
| %304 = llvm.add %302, %303 : i64 | |
| %305 = llvm.icmp "slt" %300, %304 : i64 | |
| %306 = llvm.select %305, %300, %304 : i1, i64 | |
| %307 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %308 = llvm.extractvalue %237[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %309 = llvm.bitcast %308 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %310 = llvm.insertvalue %309, %307[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %311 = llvm.extractvalue %237[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %312 = llvm.bitcast %311 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %313 = llvm.insertvalue %312, %310[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %314 = llvm.extractvalue %237[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %315 = llvm.extractvalue %237[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %316 = llvm.extractvalue %237[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %317 = llvm.mul %288, %314 : i64 | |
| %318 = llvm.add %316, %317 : i64 | |
| %319 = llvm.mul %292, %315 : i64 | |
| %320 = llvm.add %318, %319 : i64 | |
| %321 = llvm.insertvalue %320, %313[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %322 = llvm.mlir.constant(1 : i64) : i64 | |
| %323 = llvm.insertvalue %306, %321[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %324 = llvm.insertvalue %322, %323[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %325 = llvm.mlir.constant(10 : i64) : i64 | |
| %326 = llvm.insertvalue %299, %324[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %327 = llvm.insertvalue %325, %326[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %328 = llvm.mlir.constant(4 : index) : i64 | |
| %329 = llvm.mlir.constant(-1 : index) : i64 | |
| %330 = llvm.mul %290, %329 : i64 | |
| %331 = llvm.add %243, %330 : i64 | |
| %332 = llvm.icmp "slt" %328, %331 : i64 | |
| %333 = llvm.select %332, %328, %331 : i1, i64 | |
| %334 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %335 = llvm.extractvalue %266[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %336 = llvm.bitcast %335 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %337 = llvm.insertvalue %336, %334[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %338 = llvm.extractvalue %266[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %339 = llvm.bitcast %338 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %340 = llvm.insertvalue %339, %337[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %341 = llvm.extractvalue %266[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %342 = llvm.extractvalue %266[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %343 = llvm.extractvalue %266[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %344 = llvm.mul %292, %341 : i64 | |
| %345 = llvm.add %343, %344 : i64 | |
| %346 = llvm.mul %290, %342 : i64 | |
| %347 = llvm.add %345, %346 : i64 | |
| %348 = llvm.insertvalue %347, %340[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %349 = llvm.mlir.constant(1 : i64) : i64 | |
| %350 = llvm.insertvalue %333, %348[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %351 = llvm.insertvalue %349, %350[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %352 = llvm.mlir.constant(10 : i64) : i64 | |
| %353 = llvm.insertvalue %306, %351[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %354 = llvm.insertvalue %352, %353[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %355 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %356 = llvm.extractvalue %287[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %357 = llvm.bitcast %356 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %358 = llvm.insertvalue %357, %355[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %359 = llvm.extractvalue %287[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %360 = llvm.bitcast %359 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %361 = llvm.insertvalue %360, %358[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %362 = llvm.extractvalue %287[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %363 = llvm.extractvalue %287[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %364 = llvm.extractvalue %287[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %365 = llvm.mul %288, %362 : i64 | |
| %366 = llvm.add %364, %365 : i64 | |
| %367 = llvm.mul %290, %363 : i64 | |
| %368 = llvm.add %366, %367 : i64 | |
| %369 = llvm.insertvalue %368, %361[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %370 = llvm.mlir.constant(1 : i64) : i64 | |
| %371 = llvm.insertvalue %333, %369[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %372 = llvm.insertvalue %370, %371[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %373 = llvm.mlir.constant(10 : i64) : i64 | |
| %374 = llvm.insertvalue %299, %372[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %375 = llvm.insertvalue %373, %374[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| llvm.br ^bb16(%2 : i64) | |
| ^bb16(%376: i64): // 2 preds: ^bb15, ^bb21 | |
| %377 = llvm.icmp "slt" %376, %299 : i64 | |
| llvm.cond_br %377, ^bb17(%2 : i64), ^bb22 | |
| ^bb17(%378: i64): // 2 preds: ^bb16, ^bb20 | |
| %379 = llvm.icmp "slt" %378, %333 : i64 | |
| llvm.cond_br %379, ^bb18(%2 : i64), ^bb21 | |
| ^bb18(%380: i64): // 2 preds: ^bb17, ^bb19 | |
| %381 = llvm.icmp "slt" %380, %306 : i64 | |
| llvm.cond_br %381, ^bb19, ^bb20 | |
| ^bb19: // pred: ^bb18 | |
| %382 = llvm.extractvalue %327[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %383 = llvm.extractvalue %327[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %384 = llvm.mlir.constant(10 : index) : i64 | |
| %385 = llvm.mul %376, %384 : i64 | |
| %386 = llvm.add %383, %385 : i64 | |
| %387 = llvm.add %386, %380 : i64 | |
| %388 = llvm.getelementptr %382[%387] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| %389 = llvm.load %388 : !llvm.ptr<f32> | |
| %390 = llvm.extractvalue %354[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %391 = llvm.extractvalue %354[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %392 = llvm.mlir.constant(10 : index) : i64 | |
| %393 = llvm.mul %380, %392 : i64 | |
| %394 = llvm.add %391, %393 : i64 | |
| %395 = llvm.add %394, %378 : i64 | |
| %396 = llvm.getelementptr %390[%395] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| %397 = llvm.load %396 : !llvm.ptr<f32> | |
| %398 = llvm.extractvalue %375[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %399 = llvm.extractvalue %375[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %400 = llvm.mlir.constant(10 : index) : i64 | |
| %401 = llvm.mul %376, %400 : i64 | |
| %402 = llvm.add %399, %401 : i64 | |
| %403 = llvm.add %402, %378 : i64 | |
| %404 = llvm.getelementptr %398[%403] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| %405 = llvm.load %404 : !llvm.ptr<f32> | |
| %406 = llvm.fmul %389, %397 : f32 | |
| %407 = llvm.fadd %405, %406 : f32 | |
| %408 = llvm.extractvalue %375[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %409 = llvm.extractvalue %375[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)> | |
| %410 = llvm.mlir.constant(10 : index) : i64 | |
| %411 = llvm.mul %376, %410 : i64 | |
| %412 = llvm.add %409, %411 : i64 | |
| %413 = llvm.add %412, %378 : i64 | |
| %414 = llvm.getelementptr %408[%413] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| llvm.store %407, %414 : !llvm.ptr<f32> | |
| %415 = llvm.add %380, %6 : i64 | |
| llvm.br ^bb18(%415 : i64) | |
| ^bb20: // pred: ^bb18 | |
| %416 = llvm.add %378, %6 : i64 | |
| llvm.br ^bb17(%416 : i64) | |
| ^bb21: // pred: ^bb17 | |
| %417 = llvm.add %376, %6 : i64 | |
| llvm.br ^bb16(%417 : i64) | |
| ^bb22: // pred: ^bb16 | |
| %418 = llvm.add %292, %0 : i64 | |
| llvm.br ^bb14(%418 : i64) | |
| ^bb23: // pred: ^bb14 | |
| %419 = llvm.add %290, %0 : i64 | |
| llvm.br ^bb13(%419 : i64) | |
| ^bb24: // pred: ^bb13 | |
| %420 = llvm.add %288, %0 : i64 | |
| llvm.br ^bb12(%420 : i64) | |
| ^bb25: // pred: ^bb12 | |
| %421 = llvm.add %207, %1 : i64 | |
| llvm.br ^bb10(%421 : i64) | |
| ^bb26: // pred: ^bb10 | |
| %422 = llvm.add %205, %1 : i64 | |
| llvm.br ^bb9(%422 : i64) | |
| ^bb27: // pred: ^bb9 | |
| %423 = llvm.add %91, %90 : i64 | |
| llvm.br ^bb3(%423 : i64) | |
| ^bb28: // pred: ^bb3 | |
| %424 = llvm.add %85, %84 : i64 | |
| llvm.br ^bb1(%424 : i64) | |
| ^bb29: // pred: ^bb1 | |
| %425 = llvm.mlir.constant(0 : i32) : i32 | |
| llvm.return %425 : i32 | |
| } | |
| } | |
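Every buffer in these dumps is carried as the standard MLIR memref descriptor, which shows up above as `!llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>`. As a reading aid, a minimal C sketch of that 2-D descriptor follows; the field names are chosen here for illustration and do not appear in the dump.

#include <stdint.h>

/* Hypothetical C view of the 2-D memref descriptor lowered above: two f32
   pointers, a linear element offset, then per-dimension sizes and strides. */
typedef struct {
  float  *allocated;   /* allocation base pointer */
  float  *aligned;     /* aligned pointer that is actually indexed */
  int64_t offset;      /* element offset applied to `aligned` */
  int64_t sizes[2];    /* extents of the two dimensions */
  int64_t strides[2];  /* strides of the two dimensions, in elements */
} MemRefDescriptor2D;

/* The address arithmetic in the dump corresponds to
   aligned[offset + i * strides[0] + j * strides[1]]. */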
| // -----// IR Dump After CSE //----- // | |
| module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} { | |
| llvm.func internal @matmul_test_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 attributes {sym_visibility = "private"} { | |
| %0 = llvm.mlir.constant(0 : index) : i64 | |
| %1 = llvm.mlir.constant(10 : index) : i64 | |
| %2 = llvm.mlir.constant(0.000000e+00 : f32) : f32 | |
| %3 = llvm.mlir.constant(1.000000e+00 : f32) : f32 | |
| %4 = llvm.mlir.constant(1 : index) : i64 | |
| %5 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %6 = llvm.extractvalue %5[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %7 = llvm.mlir.constant(0 : i64) : i64 | |
| %8 = llvm.getelementptr %6[%7] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %9 = llvm.load %8 : !llvm.ptr<ptr<i8>> | |
| %10 = llvm.getelementptr %9[%0] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %11 = llvm.bitcast %10 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %12 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %13 = llvm.extractvalue %12[0] : !llvm.array<3 x i32> | |
| %14 = llvm.zext %13 : i32 to i64 | |
| %15 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %16 = llvm.extractvalue %15[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %17 = llvm.extractvalue %16[0] : !llvm.array<3 x i32> | |
| %18 = llvm.zext %17 : i32 to i64 | |
| %19 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %20 = llvm.extractvalue %19[1] : !llvm.array<3 x i32> | |
| %21 = llvm.zext %20 : i32 to i64 | |
| %22 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %23 = llvm.extractvalue %22[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %24 = llvm.extractvalue %23[1] : !llvm.array<3 x i32> | |
| %25 = llvm.zext %24 : i32 to i64 | |
| %26 = llvm.mlir.constant(64 : index) : i64 | |
| %27 = llvm.mul %21, %26 : i64 | |
| %28 = llvm.mul %25, %26 : i64 | |
| llvm.br ^bb1(%27 : i64) | |
| ^bb1(%29: i64): // 2 preds: ^bb0, ^bb10 | |
| %30 = llvm.icmp "slt" %29, %1 : i64 | |
| llvm.cond_br %30, ^bb2, ^bb11 | |
| ^bb2: // pred: ^bb1 | |
| %31 = llvm.mul %14, %26 : i64 | |
| %32 = llvm.mul %18, %26 : i64 | |
| llvm.br ^bb3(%31 : i64) | |
| ^bb3(%33: i64): // 2 preds: ^bb2, ^bb9 | |
| %34 = llvm.icmp "slt" %33, %1 : i64 | |
| llvm.cond_br %34, ^bb4, ^bb10 | |
| ^bb4: // pred: ^bb3 | |
| %35 = llvm.mlir.constant(-1 : index) : i64 | |
| %36 = llvm.mul %29, %35 : i64 | |
| %37 = llvm.add %36, %1 : i64 | |
| %38 = llvm.icmp "slt" %26, %37 : i64 | |
| %39 = llvm.select %38, %26, %37 : i1, i64 | |
| %40 = llvm.mul %33, %35 : i64 | |
| %41 = llvm.add %40, %1 : i64 | |
| %42 = llvm.icmp "slt" %26, %41 : i64 | |
| %43 = llvm.select %42, %26, %41 : i1, i64 | |
| %44 = llvm.bitcast %11 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %45 = llvm.mul %29, %1 : i64 | |
| %46 = llvm.add %0, %45 : i64 | |
| %47 = llvm.mul %33, %4 : i64 | |
| %48 = llvm.add %46, %47 : i64 | |
| llvm.br ^bb5(%0 : i64) | |
| ^bb5(%49: i64): // 2 preds: ^bb4, ^bb8 | |
| %50 = llvm.icmp "slt" %49, %39 : i64 | |
| llvm.cond_br %50, ^bb6(%0 : i64), ^bb9 | |
| ^bb6(%51: i64): // 2 preds: ^bb5, ^bb7 | |
| %52 = llvm.icmp "slt" %51, %43 : i64 | |
| llvm.cond_br %52, ^bb7, ^bb8 | |
| ^bb7: // pred: ^bb6 | |
| %53 = llvm.add %49, %29 : i64 | |
| %54 = llvm.add %51, %33 : i64 | |
| %55 = llvm.icmp "eq" %53, %54 : i64 | |
| %56 = llvm.select %55, %2, %3 : i1, f32 | |
| %57 = llvm.mul %49, %1 : i64 | |
| %58 = llvm.add %48, %57 : i64 | |
| %59 = llvm.add %58, %51 : i64 | |
| %60 = llvm.getelementptr %44[%59] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| llvm.store %56, %60 : !llvm.ptr<f32> | |
| %61 = llvm.add %51, %4 : i64 | |
| llvm.br ^bb6(%61 : i64) | |
| ^bb8: // pred: ^bb6 | |
| %62 = llvm.add %49, %4 : i64 | |
| llvm.br ^bb5(%62 : i64) | |
| ^bb9: // pred: ^bb5 | |
| %63 = llvm.add %33, %32 : i64 | |
| llvm.br ^bb3(%63 : i64) | |
| ^bb10: // pred: ^bb3 | |
| %64 = llvm.add %29, %28 : i64 | |
| llvm.br ^bb1(%64 : i64) | |
| ^bb11: // pred: ^bb1 | |
| %65 = llvm.mlir.constant(0 : i32) : i32 | |
| llvm.return %65 : i32 | |
| } | |
| } | |
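For orientation, the store in `^bb7` of the dispatch above writes 0.0 on the diagonal and 1.0 everywhere else of a row-major 10x10 f32 buffer. A hedged C sketch of that body, with illustrative names (`i0`/`j0` stand for the workgroup tile offsets, `i`/`j` for the tile-local inductions):

#include <stdint.h>

/* Sketch of the ^bb7 body above: zero on the diagonal, one elsewhere. */
void fill_body(float *out, int64_t i0, int64_t j0, int64_t i, int64_t j) {
  float v = ((i0 + i) == (j0 + j)) ? 0.0f : 1.0f;
  out[(i0 + i) * 10 + (j0 + j)] = v;
}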
| // -----// IR Dump After Canonicalizer //----- // | |
| module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} { | |
| llvm.func internal @matmul_test_dispatch_2(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 attributes {sym_visibility = "private"} { | |
| %0 = llvm.mlir.constant(4 : index) : i64 | |
| %1 = llvm.mlir.constant(32 : index) : i64 | |
| %2 = llvm.mlir.constant(0 : index) : i64 | |
| %3 = llvm.mlir.constant(10 : index) : i64 | |
| %4 = llvm.mlir.constant(0.000000e+00 : f32) : f32 | |
| %5 = llvm.mlir.constant(1.000000e+00 : f32) : f32 | |
| %6 = llvm.mlir.constant(1 : index) : i64 | |
| %7 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %8 = llvm.extractvalue %7[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %9 = llvm.mlir.constant(0 : i64) : i64 | |
| %10 = llvm.getelementptr %8[%9] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %11 = llvm.load %10 : !llvm.ptr<ptr<i8>> | |
| %12 = llvm.getelementptr %11[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %13 = llvm.bitcast %12 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %14 = llvm.mlir.constant(0 : index) : i64 | |
| %15 = llvm.mlir.constant(10 : index) : i64 | |
| %16 = llvm.mlir.constant(1 : index) : i64 | |
| %17 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %18 = llvm.extractvalue %17[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %19 = llvm.mlir.constant(1 : i64) : i64 | |
| %20 = llvm.getelementptr %18[%19] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %21 = llvm.load %20 : !llvm.ptr<ptr<i8>> | |
| %22 = llvm.getelementptr %21[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %23 = llvm.bitcast %22 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %24 = llvm.mlir.constant(0 : index) : i64 | |
| %25 = llvm.mlir.constant(10 : index) : i64 | |
| %26 = llvm.mlir.constant(1 : index) : i64 | |
| %27 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %28 = llvm.extractvalue %27[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %29 = llvm.mlir.constant(2 : i64) : i64 | |
| %30 = llvm.getelementptr %28[%29] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %31 = llvm.load %30 : !llvm.ptr<ptr<i8>> | |
| %32 = llvm.getelementptr %31[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %33 = llvm.bitcast %32 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %34 = llvm.mlir.constant(0 : index) : i64 | |
| %35 = llvm.mlir.constant(10 : index) : i64 | |
| %36 = llvm.mlir.constant(1 : index) : i64 | |
| %37 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %38 = llvm.extractvalue %37[0] : !llvm.array<3 x i32> | |
| %39 = llvm.zext %38 : i32 to i64 | |
| %40 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %41 = llvm.extractvalue %40[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %42 = llvm.extractvalue %41[0] : !llvm.array<3 x i32> | |
| %43 = llvm.zext %42 : i32 to i64 | |
| %44 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %45 = llvm.extractvalue %44[1] : !llvm.array<3 x i32> | |
| %46 = llvm.zext %45 : i32 to i64 | |
| %47 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %48 = llvm.extractvalue %47[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %49 = llvm.extractvalue %48[1] : !llvm.array<3 x i32> | |
| %50 = llvm.zext %49 : i32 to i64 | |
| %51 = llvm.mlir.constant(64 : index) : i64 | |
| %52 = llvm.mul %46, %51 : i64 | |
| %53 = llvm.mlir.constant(64 : index) : i64 | |
| %54 = llvm.mul %50, %53 : i64 | |
| llvm.br ^bb1(%52 : i64) | |
| ^bb1(%55: i64): // 2 preds: ^bb0, ^bb28 | |
| %56 = llvm.icmp "slt" %55, %3 : i64 | |
| llvm.cond_br %56, ^bb2, ^bb29 | |
| ^bb2: // pred: ^bb1 | |
| %57 = llvm.mlir.constant(64 : index) : i64 | |
| %58 = llvm.mul %39, %57 : i64 | |
| %59 = llvm.mlir.constant(64 : index) : i64 | |
| %60 = llvm.mul %43, %59 : i64 | |
| llvm.br ^bb3(%58 : i64) | |
| ^bb3(%61: i64): // 2 preds: ^bb2, ^bb27 | |
| %62 = llvm.icmp "slt" %61, %3 : i64 | |
| llvm.cond_br %62, ^bb4, ^bb28 | |
| ^bb4: // pred: ^bb3 | |
| %63 = llvm.mlir.constant(64 : index) : i64 | |
| %64 = llvm.mlir.constant(-1 : index) : i64 | |
| %65 = llvm.mul %55, %64 : i64 | |
| %66 = llvm.mlir.constant(10 : index) : i64 | |
| %67 = llvm.add %65, %66 : i64 | |
| %68 = llvm.icmp "slt" %63, %67 : i64 | |
| %69 = llvm.select %68, %63, %67 : i1, i64 | |
| %70 = llvm.bitcast %13 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %71 = llvm.mul %55, %15 : i64 | |
| %72 = llvm.add %14, %71 : i64 | |
| %73 = llvm.mlir.constant(0 : i64) : i64 | |
| %74 = llvm.mul %73, %16 : i64 | |
| %75 = llvm.add %72, %74 : i64 | |
| %76 = llvm.mlir.constant(1 : i64) : i64 | |
| %77 = llvm.mlir.constant(10 : i64) : i64 | |
| %78 = llvm.mlir.constant(64 : index) : i64 | |
| %79 = llvm.mlir.constant(-1 : index) : i64 | |
| %80 = llvm.mul %61, %79 : i64 | |
| %81 = llvm.mlir.constant(10 : index) : i64 | |
| %82 = llvm.add %80, %81 : i64 | |
| %83 = llvm.icmp "slt" %78, %82 : i64 | |
| %84 = llvm.select %83, %78, %82 : i1, i64 | |
| %85 = llvm.bitcast %23 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %86 = llvm.mlir.constant(0 : i64) : i64 | |
| %87 = llvm.mul %86, %25 : i64 | |
| %88 = llvm.add %24, %87 : i64 | |
| %89 = llvm.mul %61, %26 : i64 | |
| %90 = llvm.add %88, %89 : i64 | |
| %91 = llvm.mlir.constant(1 : i64) : i64 | |
| %92 = llvm.mlir.constant(10 : i64) : i64 | |
| %93 = llvm.mlir.constant(-1 : index) : i64 | |
| %94 = llvm.mul %55, %93 : i64 | |
| %95 = llvm.mlir.constant(10 : index) : i64 | |
| %96 = llvm.add %94, %95 : i64 | |
| %97 = llvm.mlir.constant(64 : index) : i64 | |
| %98 = llvm.icmp "slt" %96, %97 : i64 | |
| %99 = llvm.select %98, %96, %97 : i1, i64 | |
| %100 = llvm.mlir.constant(-1 : index) : i64 | |
| %101 = llvm.mul %61, %100 : i64 | |
| %102 = llvm.mlir.constant(10 : index) : i64 | |
| %103 = llvm.add %101, %102 : i64 | |
| %104 = llvm.mlir.constant(64 : index) : i64 | |
| %105 = llvm.icmp "slt" %103, %104 : i64 | |
| %106 = llvm.select %105, %103, %104 : i1, i64 | |
| %107 = llvm.bitcast %33 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %108 = llvm.mul %55, %35 : i64 | |
| %109 = llvm.add %34, %108 : i64 | |
| %110 = llvm.mul %61, %36 : i64 | |
| %111 = llvm.add %109, %110 : i64 | |
| %112 = llvm.mlir.constant(1 : i64) : i64 | |
| %113 = llvm.mlir.constant(10 : i64) : i64 | |
| llvm.br ^bb5(%2 : i64) | |
| ^bb5(%114: i64): // 2 preds: ^bb4, ^bb8 | |
| %115 = llvm.icmp "slt" %114, %99 : i64 | |
| llvm.cond_br %115, ^bb6(%2 : i64), ^bb9(%2 : i64) | |
| ^bb6(%116: i64): // 2 preds: ^bb5, ^bb7 | |
| %117 = llvm.icmp "slt" %116, %106 : i64 | |
| llvm.cond_br %117, ^bb7, ^bb8 | |
| ^bb7: // pred: ^bb6 | |
| %118 = llvm.add %114, %55 : i64 | |
| %119 = llvm.add %116, %61 : i64 | |
| %120 = llvm.icmp "eq" %118, %119 : i64 | |
| %121 = llvm.select %120, %4, %5 : i1, f32 | |
| %122 = llvm.mlir.constant(10 : index) : i64 | |
| %123 = llvm.mul %114, %122 : i64 | |
| %124 = llvm.add %111, %123 : i64 | |
| %125 = llvm.add %124, %116 : i64 | |
| %126 = llvm.getelementptr %107[%125] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| llvm.store %121, %126 : !llvm.ptr<f32> | |
| %127 = llvm.add %116, %6 : i64 | |
| llvm.br ^bb6(%127 : i64) | |
| ^bb8: // pred: ^bb6 | |
| %128 = llvm.add %114, %6 : i64 | |
| llvm.br ^bb5(%128 : i64) | |
| ^bb9(%129: i64): // 2 preds: ^bb5, ^bb26 | |
| %130 = llvm.icmp "slt" %129, %69 : i64 | |
| llvm.cond_br %130, ^bb10(%2 : i64), ^bb27 | |
| ^bb10(%131: i64): // 2 preds: ^bb9, ^bb25 | |
| %132 = llvm.icmp "slt" %131, %84 : i64 | |
| llvm.cond_br %132, ^bb11, ^bb26 | |
| ^bb11: // pred: ^bb10 | |
| %133 = llvm.mlir.constant(32 : index) : i64 | |
| %134 = llvm.mlir.constant(-1 : index) : i64 | |
| %135 = llvm.mul %129, %134 : i64 | |
| %136 = llvm.add %69, %135 : i64 | |
| %137 = llvm.icmp "slt" %133, %136 : i64 | |
| %138 = llvm.select %137, %133, %136 : i1, i64 | |
| %139 = llvm.bitcast %70 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %140 = llvm.mul %129, %77 : i64 | |
| %141 = llvm.add %75, %140 : i64 | |
| %142 = llvm.mlir.constant(0 : i64) : i64 | |
| %143 = llvm.mul %142, %76 : i64 | |
| %144 = llvm.add %141, %143 : i64 | |
| %145 = llvm.mlir.constant(1 : i64) : i64 | |
| %146 = llvm.mlir.constant(10 : i64) : i64 | |
| %147 = llvm.mlir.constant(32 : index) : i64 | |
| %148 = llvm.mlir.constant(-1 : index) : i64 | |
| %149 = llvm.mul %131, %148 : i64 | |
| %150 = llvm.add %84, %149 : i64 | |
| %151 = llvm.icmp "slt" %147, %150 : i64 | |
| %152 = llvm.select %151, %147, %150 : i1, i64 | |
| %153 = llvm.bitcast %85 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %154 = llvm.mlir.constant(0 : i64) : i64 | |
| %155 = llvm.mul %154, %92 : i64 | |
| %156 = llvm.add %90, %155 : i64 | |
| %157 = llvm.mul %131, %91 : i64 | |
| %158 = llvm.add %156, %157 : i64 | |
| %159 = llvm.mlir.constant(1 : i64) : i64 | |
| %160 = llvm.mlir.constant(10 : i64) : i64 | |
| %161 = llvm.bitcast %107 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %162 = llvm.mul %129, %113 : i64 | |
| %163 = llvm.add %111, %162 : i64 | |
| %164 = llvm.mul %131, %112 : i64 | |
| %165 = llvm.add %163, %164 : i64 | |
| %166 = llvm.mlir.constant(1 : i64) : i64 | |
| %167 = llvm.mlir.constant(10 : i64) : i64 | |
| llvm.br ^bb12(%2 : i64) | |
| ^bb12(%168: i64): // 2 preds: ^bb11, ^bb24 | |
| %169 = llvm.icmp "slt" %168, %138 : i64 | |
| llvm.cond_br %169, ^bb13(%2 : i64), ^bb25 | |
| ^bb13(%170: i64): // 2 preds: ^bb12, ^bb23 | |
| %171 = llvm.icmp "slt" %170, %152 : i64 | |
| llvm.cond_br %171, ^bb14(%2 : i64), ^bb24 | |
| ^bb14(%172: i64): // 2 preds: ^bb13, ^bb22 | |
| %173 = llvm.icmp "slt" %172, %3 : i64 | |
| llvm.cond_br %173, ^bb15, ^bb23 | |
| ^bb15: // pred: ^bb14 | |
| %174 = llvm.mlir.constant(4 : index) : i64 | |
| %175 = llvm.mlir.constant(-1 : index) : i64 | |
| %176 = llvm.mul %168, %175 : i64 | |
| %177 = llvm.add %138, %176 : i64 | |
| %178 = llvm.icmp "slt" %174, %177 : i64 | |
| %179 = llvm.select %178, %174, %177 : i1, i64 | |
| %180 = llvm.mlir.constant(4 : index) : i64 | |
| %181 = llvm.mlir.constant(-1 : index) : i64 | |
| %182 = llvm.mul %172, %181 : i64 | |
| %183 = llvm.mlir.constant(10 : index) : i64 | |
| %184 = llvm.add %182, %183 : i64 | |
| %185 = llvm.icmp "slt" %180, %184 : i64 | |
| %186 = llvm.select %185, %180, %184 : i1, i64 | |
| %187 = llvm.bitcast %139 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %188 = llvm.mul %168, %146 : i64 | |
| %189 = llvm.add %144, %188 : i64 | |
| %190 = llvm.mul %172, %145 : i64 | |
| %191 = llvm.add %189, %190 : i64 | |
| %192 = llvm.mlir.constant(4 : index) : i64 | |
| %193 = llvm.mlir.constant(-1 : index) : i64 | |
| %194 = llvm.mul %170, %193 : i64 | |
| %195 = llvm.add %152, %194 : i64 | |
| %196 = llvm.icmp "slt" %192, %195 : i64 | |
| %197 = llvm.select %196, %192, %195 : i1, i64 | |
| %198 = llvm.bitcast %153 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %199 = llvm.mul %172, %160 : i64 | |
| %200 = llvm.add %158, %199 : i64 | |
| %201 = llvm.mul %170, %159 : i64 | |
| %202 = llvm.add %200, %201 : i64 | |
| %203 = llvm.bitcast %161 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %204 = llvm.mul %168, %167 : i64 | |
| %205 = llvm.add %165, %204 : i64 | |
| %206 = llvm.mul %170, %166 : i64 | |
| %207 = llvm.add %205, %206 : i64 | |
| llvm.br ^bb16(%2 : i64) | |
| ^bb16(%208: i64): // 2 preds: ^bb15, ^bb21 | |
| %209 = llvm.icmp "slt" %208, %179 : i64 | |
| llvm.cond_br %209, ^bb17(%2 : i64), ^bb22 | |
| ^bb17(%210: i64): // 2 preds: ^bb16, ^bb20 | |
| %211 = llvm.icmp "slt" %210, %197 : i64 | |
| llvm.cond_br %211, ^bb18(%2 : i64), ^bb21 | |
| ^bb18(%212: i64): // 2 preds: ^bb17, ^bb19 | |
| %213 = llvm.icmp "slt" %212, %186 : i64 | |
| llvm.cond_br %213, ^bb19, ^bb20 | |
| ^bb19: // pred: ^bb18 | |
| %214 = llvm.mlir.constant(10 : index) : i64 | |
| %215 = llvm.mul %208, %214 : i64 | |
| %216 = llvm.add %191, %215 : i64 | |
| %217 = llvm.add %216, %212 : i64 | |
| %218 = llvm.getelementptr %187[%217] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| %219 = llvm.load %218 : !llvm.ptr<f32> | |
| %220 = llvm.mlir.constant(10 : index) : i64 | |
| %221 = llvm.mul %212, %220 : i64 | |
| %222 = llvm.add %202, %221 : i64 | |
| %223 = llvm.add %222, %210 : i64 | |
| %224 = llvm.getelementptr %198[%223] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| %225 = llvm.load %224 : !llvm.ptr<f32> | |
| %226 = llvm.mlir.constant(10 : index) : i64 | |
| %227 = llvm.mul %208, %226 : i64 | |
| %228 = llvm.add %207, %227 : i64 | |
| %229 = llvm.add %228, %210 : i64 | |
| %230 = llvm.getelementptr %203[%229] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| %231 = llvm.load %230 : !llvm.ptr<f32> | |
| %232 = llvm.fmul %219, %225 : f32 | |
| %233 = llvm.fadd %231, %232 : f32 | |
| %234 = llvm.mlir.constant(10 : index) : i64 | |
| %235 = llvm.mul %208, %234 : i64 | |
| %236 = llvm.add %207, %235 : i64 | |
| %237 = llvm.add %236, %210 : i64 | |
| %238 = llvm.getelementptr %203[%237] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| llvm.store %233, %238 : !llvm.ptr<f32> | |
| %239 = llvm.add %212, %6 : i64 | |
| llvm.br ^bb18(%239 : i64) | |
| ^bb20: // pred: ^bb18 | |
| %240 = llvm.add %210, %6 : i64 | |
| llvm.br ^bb17(%240 : i64) | |
| ^bb21: // pred: ^bb17 | |
| %241 = llvm.add %208, %6 : i64 | |
| llvm.br ^bb16(%241 : i64) | |
| ^bb22: // pred: ^bb16 | |
| %242 = llvm.add %172, %0 : i64 | |
| llvm.br ^bb14(%242 : i64) | |
| ^bb23: // pred: ^bb14 | |
| %243 = llvm.add %170, %0 : i64 | |
| llvm.br ^bb13(%243 : i64) | |
| ^bb24: // pred: ^bb13 | |
| %244 = llvm.add %168, %0 : i64 | |
| llvm.br ^bb12(%244 : i64) | |
| ^bb25: // pred: ^bb12 | |
| %245 = llvm.add %131, %1 : i64 | |
| llvm.br ^bb10(%245 : i64) | |
| ^bb26: // pred: ^bb10 | |
| %246 = llvm.add %129, %1 : i64 | |
| llvm.br ^bb9(%246 : i64) | |
| ^bb27: // pred: ^bb9 | |
| %247 = llvm.add %61, %60 : i64 | |
| llvm.br ^bb3(%247 : i64) | |
| ^bb28: // pred: ^bb3 | |
| %248 = llvm.add %55, %54 : i64 | |
| llvm.br ^bb1(%248 : i64) | |
| ^bb29: // pred: ^bb1 | |
| %249 = llvm.mlir.constant(0 : i32) : i32 | |
| llvm.return %249 : i32 | |
| } | |
| } | |
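Both dispatch functions take a pointer to `iree_hal_executable_dispatch_state_v0_t`, lowered here as `!llvm.struct<(array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>`, plus a workgroup-ID triple in `%arg1`. A rough C mirror of that layout follows; only the field order and types come from the dump, and the field names are guesses added for readability.

#include <stdint.h>

/* Assumed field meanings for the dispatch state read in the prologues above. */
typedef struct {
  uint32_t  workgroup_count[3];  /* field [0]: elements 0 and 1 are read above */
  uint32_t  workgroup_size[3];   /* field [1] */
  int64_t   push_constant_count; /* field [2] */
  uint32_t *push_constants;      /* field [3] */
  int64_t   binding_count;       /* field [4] */
  uint8_t **binding_ptrs;        /* field [5]: GEP'd by binding index, bitcast to f32* */
  int64_t  *binding_lengths;     /* field [6] */
} dispatch_state_v0_t;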
| // -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateTargetExecutableVariantsPass //----- // | |
| hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> { | |
| hal.executable.entry_point public @matmul_test_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} { | |
| ^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors | |
| %c1 = constant 1 : index | |
| %0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg0] | |
| %1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1] | |
| hal.return %0, %1, %c1 : index, index, index | |
| } | |
| builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} { | |
| llvm.func internal @matmul_test_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 attributes {sym_visibility = "private"} { | |
| %0 = llvm.mlir.constant(0 : index) : i64 | |
| %1 = llvm.mlir.constant(10 : index) : i64 | |
| %2 = llvm.mlir.constant(0.000000e+00 : f32) : f32 | |
| %3 = llvm.mlir.constant(1.000000e+00 : f32) : f32 | |
| %4 = llvm.mlir.constant(1 : index) : i64 | |
| %5 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %6 = llvm.extractvalue %5[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %7 = llvm.mlir.constant(0 : i64) : i64 | |
| %8 = llvm.getelementptr %6[%7] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %9 = llvm.load %8 : !llvm.ptr<ptr<i8>> | |
| %10 = llvm.getelementptr %9[%0] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %11 = llvm.bitcast %10 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %12 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %13 = llvm.extractvalue %12[0] : !llvm.array<3 x i32> | |
| %14 = llvm.zext %13 : i32 to i64 | |
| %15 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %16 = llvm.extractvalue %15[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %17 = llvm.extractvalue %16[0] : !llvm.array<3 x i32> | |
| %18 = llvm.zext %17 : i32 to i64 | |
| %19 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %20 = llvm.extractvalue %19[1] : !llvm.array<3 x i32> | |
| %21 = llvm.zext %20 : i32 to i64 | |
| %22 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %23 = llvm.extractvalue %22[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %24 = llvm.extractvalue %23[1] : !llvm.array<3 x i32> | |
| %25 = llvm.zext %24 : i32 to i64 | |
| %26 = llvm.mlir.constant(64 : index) : i64 | |
| %27 = llvm.mul %21, %26 : i64 | |
| %28 = llvm.mul %25, %26 : i64 | |
| llvm.br ^bb1(%27 : i64) | |
| ^bb1(%29: i64): // 2 preds: ^bb0, ^bb10 | |
| %30 = llvm.icmp "slt" %29, %1 : i64 | |
| llvm.cond_br %30, ^bb2, ^bb11 | |
| ^bb2: // pred: ^bb1 | |
| %31 = llvm.mul %14, %26 : i64 | |
| %32 = llvm.mul %18, %26 : i64 | |
| llvm.br ^bb3(%31 : i64) | |
| ^bb3(%33: i64): // 2 preds: ^bb2, ^bb9 | |
| %34 = llvm.icmp "slt" %33, %1 : i64 | |
| llvm.cond_br %34, ^bb4, ^bb10 | |
| ^bb4: // pred: ^bb3 | |
| %35 = llvm.mlir.constant(-1 : index) : i64 | |
| %36 = llvm.mul %29, %35 : i64 | |
| %37 = llvm.add %36, %1 : i64 | |
| %38 = llvm.icmp "slt" %26, %37 : i64 | |
| %39 = llvm.select %38, %26, %37 : i1, i64 | |
| %40 = llvm.mul %33, %35 : i64 | |
| %41 = llvm.add %40, %1 : i64 | |
| %42 = llvm.icmp "slt" %26, %41 : i64 | |
| %43 = llvm.select %42, %26, %41 : i1, i64 | |
| %44 = llvm.bitcast %11 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %45 = llvm.mul %29, %1 : i64 | |
| %46 = llvm.add %0, %45 : i64 | |
| %47 = llvm.mul %33, %4 : i64 | |
| %48 = llvm.add %46, %47 : i64 | |
| llvm.br ^bb5(%0 : i64) | |
| ^bb5(%49: i64): // 2 preds: ^bb4, ^bb8 | |
| %50 = llvm.icmp "slt" %49, %39 : i64 | |
| llvm.cond_br %50, ^bb6(%0 : i64), ^bb9 | |
| ^bb6(%51: i64): // 2 preds: ^bb5, ^bb7 | |
| %52 = llvm.icmp "slt" %51, %43 : i64 | |
| llvm.cond_br %52, ^bb7, ^bb8 | |
| ^bb7: // pred: ^bb6 | |
| %53 = llvm.add %49, %29 : i64 | |
| %54 = llvm.add %51, %33 : i64 | |
| %55 = llvm.icmp "eq" %53, %54 : i64 | |
| %56 = llvm.select %55, %2, %3 : i1, f32 | |
| %57 = llvm.mul %49, %1 : i64 | |
| %58 = llvm.add %48, %57 : i64 | |
| %59 = llvm.add %58, %51 : i64 | |
| %60 = llvm.getelementptr %44[%59] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| llvm.store %56, %60 : !llvm.ptr<f32> | |
| %61 = llvm.add %51, %4 : i64 | |
| llvm.br ^bb6(%61 : i64) | |
| ^bb8: // pred: ^bb6 | |
| %62 = llvm.add %49, %4 : i64 | |
| llvm.br ^bb5(%62 : i64) | |
| ^bb9: // pred: ^bb5 | |
| %63 = llvm.add %33, %32 : i64 | |
| llvm.br ^bb3(%63 : i64) | |
| ^bb10: // pred: ^bb3 | |
| %64 = llvm.add %29, %28 : i64 | |
| llvm.br ^bb1(%64 : i64) | |
| ^bb11: // pred: ^bb1 | |
| %65 = llvm.mlir.constant(0 : i32) : i32 | |
| llvm.return %65 : i32 | |
| } | |
| } | |
| } | |
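The entry point above declares `workloadPerWorkgroup = [64, 64]` and derives the X and Y workgroup counts with `ceildiv 64`, leaving Z at 1. A small C sketch of that launch-size computation (names illustrative):

#include <stdint.h>

/* Grid size matching the affine.apply ceildiv 64 in the entry point above. */
static int64_t ceil_div(int64_t a, int64_t b) { return (a + b - 1) / b; }

void workgroup_count(int64_t workload_x, int64_t workload_y, int64_t out[3]) {
  out[0] = ceil_div(workload_x, 64);  /* X tiles of 64 */
  out[1] = ceil_div(workload_y, 64);  /* Y tiles of 64 */
  out[2] = 1;                         /* Z is the constant 1 returned above */
}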
| // -----// IR Dump After CSE //----- // | |
| module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} { | |
| llvm.func internal @matmul_test_dispatch_2(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 attributes {sym_visibility = "private"} { | |
| %0 = llvm.mlir.constant(4 : index) : i64 | |
| %1 = llvm.mlir.constant(32 : index) : i64 | |
| %2 = llvm.mlir.constant(0 : index) : i64 | |
| %3 = llvm.mlir.constant(10 : index) : i64 | |
| %4 = llvm.mlir.constant(0.000000e+00 : f32) : f32 | |
| %5 = llvm.mlir.constant(1.000000e+00 : f32) : f32 | |
| %6 = llvm.mlir.constant(1 : index) : i64 | |
| %7 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %8 = llvm.extractvalue %7[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %9 = llvm.mlir.constant(0 : i64) : i64 | |
| %10 = llvm.getelementptr %8[%9] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %11 = llvm.load %10 : !llvm.ptr<ptr<i8>> | |
| %12 = llvm.getelementptr %11[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %13 = llvm.bitcast %12 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %14 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %15 = llvm.extractvalue %14[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %16 = llvm.mlir.constant(1 : i64) : i64 | |
| %17 = llvm.getelementptr %15[%16] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %18 = llvm.load %17 : !llvm.ptr<ptr<i8>> | |
| %19 = llvm.getelementptr %18[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %21 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %22 = llvm.extractvalue %21[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %23 = llvm.mlir.constant(2 : i64) : i64 | |
| %24 = llvm.getelementptr %22[%23] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %25 = llvm.load %24 : !llvm.ptr<ptr<i8>> | |
| %26 = llvm.getelementptr %25[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %27 = llvm.bitcast %26 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %28 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %29 = llvm.extractvalue %28[0] : !llvm.array<3 x i32> | |
| %30 = llvm.zext %29 : i32 to i64 | |
| %31 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %32 = llvm.extractvalue %31[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %33 = llvm.extractvalue %32[0] : !llvm.array<3 x i32> | |
| %34 = llvm.zext %33 : i32 to i64 | |
| %35 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %36 = llvm.extractvalue %35[1] : !llvm.array<3 x i32> | |
| %37 = llvm.zext %36 : i32 to i64 | |
| %38 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %39 = llvm.extractvalue %38[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %40 = llvm.extractvalue %39[1] : !llvm.array<3 x i32> | |
| %41 = llvm.zext %40 : i32 to i64 | |
| %42 = llvm.mlir.constant(64 : index) : i64 | |
| %43 = llvm.mul %37, %42 : i64 | |
| %44 = llvm.mul %41, %42 : i64 | |
| llvm.br ^bb1(%43 : i64) | |
| ^bb1(%45: i64): // 2 preds: ^bb0, ^bb28 | |
| %46 = llvm.icmp "slt" %45, %3 : i64 | |
| llvm.cond_br %46, ^bb2, ^bb29 | |
| ^bb2: // pred: ^bb1 | |
| %47 = llvm.mul %30, %42 : i64 | |
| %48 = llvm.mul %34, %42 : i64 | |
| llvm.br ^bb3(%47 : i64) | |
| ^bb3(%49: i64): // 2 preds: ^bb2, ^bb27 | |
| %50 = llvm.icmp "slt" %49, %3 : i64 | |
| llvm.cond_br %50, ^bb4, ^bb28 | |
| ^bb4: // pred: ^bb3 | |
| %51 = llvm.mlir.constant(-1 : index) : i64 | |
| %52 = llvm.mul %45, %51 : i64 | |
| %53 = llvm.add %52, %3 : i64 | |
| %54 = llvm.icmp "slt" %42, %53 : i64 | |
| %55 = llvm.select %54, %42, %53 : i1, i64 | |
| %56 = llvm.bitcast %13 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %57 = llvm.mul %45, %3 : i64 | |
| %58 = llvm.add %2, %57 : i64 | |
| %59 = llvm.mul %9, %6 : i64 | |
| %60 = llvm.add %58, %59 : i64 | |
| %61 = llvm.mlir.constant(10 : i64) : i64 | |
| %62 = llvm.mul %49, %51 : i64 | |
| %63 = llvm.add %62, %3 : i64 | |
| %64 = llvm.icmp "slt" %42, %63 : i64 | |
| %65 = llvm.select %64, %42, %63 : i1, i64 | |
| %66 = llvm.bitcast %20 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %67 = llvm.mul %9, %3 : i64 | |
| %68 = llvm.add %2, %67 : i64 | |
| %69 = llvm.mul %49, %6 : i64 | |
| %70 = llvm.add %68, %69 : i64 | |
| %71 = llvm.icmp "slt" %53, %42 : i64 | |
| %72 = llvm.select %71, %53, %42 : i1, i64 | |
| %73 = llvm.icmp "slt" %63, %42 : i64 | |
| %74 = llvm.select %73, %63, %42 : i1, i64 | |
| %75 = llvm.bitcast %27 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %76 = llvm.add %58, %69 : i64 | |
| llvm.br ^bb5(%2 : i64) | |
| ^bb5(%77: i64): // 2 preds: ^bb4, ^bb8 | |
| %78 = llvm.icmp "slt" %77, %72 : i64 | |
| llvm.cond_br %78, ^bb6(%2 : i64), ^bb9(%2 : i64) | |
| ^bb6(%79: i64): // 2 preds: ^bb5, ^bb7 | |
| %80 = llvm.icmp "slt" %79, %74 : i64 | |
| llvm.cond_br %80, ^bb7, ^bb8 | |
| ^bb7: // pred: ^bb6 | |
| %81 = llvm.add %77, %45 : i64 | |
| %82 = llvm.add %79, %49 : i64 | |
| %83 = llvm.icmp "eq" %81, %82 : i64 | |
| %84 = llvm.select %83, %4, %5 : i1, f32 | |
| %85 = llvm.mul %77, %3 : i64 | |
| %86 = llvm.add %76, %85 : i64 | |
| %87 = llvm.add %86, %79 : i64 | |
| %88 = llvm.getelementptr %75[%87] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| llvm.store %84, %88 : !llvm.ptr<f32> | |
| %89 = llvm.add %79, %6 : i64 | |
| llvm.br ^bb6(%89 : i64) | |
| ^bb8: // pred: ^bb6 | |
| %90 = llvm.add %77, %6 : i64 | |
| llvm.br ^bb5(%90 : i64) | |
| ^bb9(%91: i64): // 2 preds: ^bb5, ^bb26 | |
| %92 = llvm.icmp "slt" %91, %55 : i64 | |
| llvm.cond_br %92, ^bb10(%2 : i64), ^bb27 | |
| ^bb10(%93: i64): // 2 preds: ^bb9, ^bb25 | |
| %94 = llvm.icmp "slt" %93, %65 : i64 | |
| llvm.cond_br %94, ^bb11, ^bb26 | |
| ^bb11: // pred: ^bb10 | |
| %95 = llvm.mul %91, %51 : i64 | |
| %96 = llvm.add %55, %95 : i64 | |
| %97 = llvm.icmp "slt" %1, %96 : i64 | |
| %98 = llvm.select %97, %1, %96 : i1, i64 | |
| %99 = llvm.bitcast %56 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %100 = llvm.mul %91, %61 : i64 | |
| %101 = llvm.add %60, %100 : i64 | |
| %102 = llvm.mul %9, %16 : i64 | |
| %103 = llvm.add %101, %102 : i64 | |
| %104 = llvm.mul %93, %51 : i64 | |
| %105 = llvm.add %65, %104 : i64 | |
| %106 = llvm.icmp "slt" %1, %105 : i64 | |
| %107 = llvm.select %106, %1, %105 : i1, i64 | |
| %108 = llvm.bitcast %66 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %109 = llvm.mul %9, %61 : i64 | |
| %110 = llvm.add %70, %109 : i64 | |
| %111 = llvm.mul %93, %16 : i64 | |
| %112 = llvm.add %110, %111 : i64 | |
| %113 = llvm.bitcast %75 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %114 = llvm.add %76, %100 : i64 | |
| %115 = llvm.add %114, %111 : i64 | |
| llvm.br ^bb12(%2 : i64) | |
| ^bb12(%116: i64): // 2 preds: ^bb11, ^bb24 | |
| %117 = llvm.icmp "slt" %116, %98 : i64 | |
| llvm.cond_br %117, ^bb13(%2 : i64), ^bb25 | |
| ^bb13(%118: i64): // 2 preds: ^bb12, ^bb23 | |
| %119 = llvm.icmp "slt" %118, %107 : i64 | |
| llvm.cond_br %119, ^bb14(%2 : i64), ^bb24 | |
| ^bb14(%120: i64): // 2 preds: ^bb13, ^bb22 | |
| %121 = llvm.icmp "slt" %120, %3 : i64 | |
| llvm.cond_br %121, ^bb15, ^bb23 | |
| ^bb15: // pred: ^bb14 | |
| %122 = llvm.mul %116, %51 : i64 | |
| %123 = llvm.add %98, %122 : i64 | |
| %124 = llvm.icmp "slt" %0, %123 : i64 | |
| %125 = llvm.select %124, %0, %123 : i1, i64 | |
| %126 = llvm.mul %120, %51 : i64 | |
| %127 = llvm.add %126, %3 : i64 | |
| %128 = llvm.icmp "slt" %0, %127 : i64 | |
| %129 = llvm.select %128, %0, %127 : i1, i64 | |
| %130 = llvm.bitcast %99 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %131 = llvm.mul %116, %61 : i64 | |
| %132 = llvm.add %103, %131 : i64 | |
| %133 = llvm.mul %120, %16 : i64 | |
| %134 = llvm.add %132, %133 : i64 | |
| %135 = llvm.mul %118, %51 : i64 | |
| %136 = llvm.add %107, %135 : i64 | |
| %137 = llvm.icmp "slt" %0, %136 : i64 | |
| %138 = llvm.select %137, %0, %136 : i1, i64 | |
| %139 = llvm.bitcast %108 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %140 = llvm.mul %120, %61 : i64 | |
| %141 = llvm.add %112, %140 : i64 | |
| %142 = llvm.mul %118, %16 : i64 | |
| %143 = llvm.add %141, %142 : i64 | |
| %144 = llvm.bitcast %113 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %145 = llvm.add %115, %131 : i64 | |
| %146 = llvm.add %145, %142 : i64 | |
| llvm.br ^bb16(%2 : i64) | |
| ^bb16(%147: i64): // 2 preds: ^bb15, ^bb21 | |
| %148 = llvm.icmp "slt" %147, %125 : i64 | |
| llvm.cond_br %148, ^bb17(%2 : i64), ^bb22 | |
| ^bb17(%149: i64): // 2 preds: ^bb16, ^bb20 | |
| %150 = llvm.icmp "slt" %149, %138 : i64 | |
| llvm.cond_br %150, ^bb18(%2 : i64), ^bb21 | |
| ^bb18(%151: i64): // 2 preds: ^bb17, ^bb19 | |
| %152 = llvm.icmp "slt" %151, %129 : i64 | |
| llvm.cond_br %152, ^bb19, ^bb20 | |
| ^bb19: // pred: ^bb18 | |
| %153 = llvm.mul %147, %3 : i64 | |
| %154 = llvm.add %134, %153 : i64 | |
| %155 = llvm.add %154, %151 : i64 | |
| %156 = llvm.getelementptr %130[%155] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| %157 = llvm.load %156 : !llvm.ptr<f32> | |
| %158 = llvm.mul %151, %3 : i64 | |
| %159 = llvm.add %143, %158 : i64 | |
| %160 = llvm.add %159, %149 : i64 | |
| %161 = llvm.getelementptr %139[%160] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| %162 = llvm.load %161 : !llvm.ptr<f32> | |
| %163 = llvm.add %146, %153 : i64 | |
| %164 = llvm.add %163, %149 : i64 | |
| %165 = llvm.getelementptr %144[%164] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| %166 = llvm.load %165 : !llvm.ptr<f32> | |
| %167 = llvm.fmul %157, %162 : f32 | |
| %168 = llvm.fadd %166, %167 : f32 | |
| llvm.store %168, %165 : !llvm.ptr<f32> | |
| %169 = llvm.add %151, %6 : i64 | |
| llvm.br ^bb18(%169 : i64) | |
| ^bb20: // pred: ^bb18 | |
| %170 = llvm.add %149, %6 : i64 | |
| llvm.br ^bb17(%170 : i64) | |
| ^bb21: // pred: ^bb17 | |
| %171 = llvm.add %147, %6 : i64 | |
| llvm.br ^bb16(%171 : i64) | |
| ^bb22: // pred: ^bb16 | |
| %172 = llvm.add %120, %0 : i64 | |
| llvm.br ^bb14(%172 : i64) | |
| ^bb23: // pred: ^bb14 | |
| %173 = llvm.add %118, %0 : i64 | |
| llvm.br ^bb13(%173 : i64) | |
| ^bb24: // pred: ^bb13 | |
| %174 = llvm.add %116, %0 : i64 | |
| llvm.br ^bb12(%174 : i64) | |
| ^bb25: // pred: ^bb12 | |
| %175 = llvm.add %93, %1 : i64 | |
| llvm.br ^bb10(%175 : i64) | |
| ^bb26: // pred: ^bb10 | |
| %176 = llvm.add %91, %1 : i64 | |
| llvm.br ^bb9(%176 : i64) | |
| ^bb27: // pred: ^bb9 | |
| %177 = llvm.add %49, %48 : i64 | |
| llvm.br ^bb3(%177 : i64) | |
| ^bb28: // pred: ^bb3 | |
| %178 = llvm.add %45, %44 : i64 | |
| llvm.br ^bb1(%178 : i64) | |
| ^bb29: // pred: ^bb1 | |
| %179 = llvm.mlir.constant(0 : i32) : i32 | |
| llvm.return %179 : i32 | |
| } | |
| } | |
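// [editor's note, not compiler output] The two closing braces above end the llvm.func and
// builtin.module of the preceding dump. Its innermost block, ^bb19, is the scalar body of the
// tiled matmul: it loads one element from each input view (%157, %162) and one from the output
// view (%166), then stores back the fmul/fadd result. Assuming the row-major strides computed by
// the surrounding index arithmetic, the per-element update is:
//   out[i][j] = out[i][j] + lhs[i][k] * rhs[k][j]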
| // -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass //----- // | |
| hal.executable private @matmul_test_dispatch_0 { | |
| hal.interface public @io { | |
| hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard" | |
| } | |
| hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> { | |
| hal.executable.entry_point public @matmul_test_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} { | |
| ^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors | |
| %c1 = constant 1 : index | |
| %0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg0] | |
| %1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1] | |
| hal.return %0, %1, %c1 : index, index, index | |
| } | |
| builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} { | |
| llvm.func internal @matmul_test_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 attributes {sym_visibility = "private"} { | |
| %0 = llvm.mlir.constant(0 : index) : i64 | |
| %1 = llvm.mlir.constant(10 : index) : i64 | |
| %2 = llvm.mlir.constant(0.000000e+00 : f32) : f32 | |
| %3 = llvm.mlir.constant(1.000000e+00 : f32) : f32 | |
| %4 = llvm.mlir.constant(1 : index) : i64 | |
| %5 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %6 = llvm.extractvalue %5[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %7 = llvm.mlir.constant(0 : i64) : i64 | |
| %8 = llvm.getelementptr %6[%7] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %9 = llvm.load %8 : !llvm.ptr<ptr<i8>> | |
| %10 = llvm.getelementptr %9[%0] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %11 = llvm.bitcast %10 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %12 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %13 = llvm.extractvalue %12[0] : !llvm.array<3 x i32> | |
| %14 = llvm.zext %13 : i32 to i64 | |
| %15 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %16 = llvm.extractvalue %15[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %17 = llvm.extractvalue %16[0] : !llvm.array<3 x i32> | |
| %18 = llvm.zext %17 : i32 to i64 | |
| %19 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %20 = llvm.extractvalue %19[1] : !llvm.array<3 x i32> | |
| %21 = llvm.zext %20 : i32 to i64 | |
| %22 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %23 = llvm.extractvalue %22[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %24 = llvm.extractvalue %23[1] : !llvm.array<3 x i32> | |
| %25 = llvm.zext %24 : i32 to i64 | |
| %26 = llvm.mlir.constant(64 : index) : i64 | |
| %27 = llvm.mul %21, %26 : i64 | |
| %28 = llvm.mul %25, %26 : i64 | |
| llvm.br ^bb1(%27 : i64) | |
| ^bb1(%29: i64): // 2 preds: ^bb0, ^bb10 | |
| %30 = llvm.icmp "slt" %29, %1 : i64 | |
| llvm.cond_br %30, ^bb2, ^bb11 | |
| ^bb2: // pred: ^bb1 | |
| %31 = llvm.mul %14, %26 : i64 | |
| %32 = llvm.mul %18, %26 : i64 | |
| llvm.br ^bb3(%31 : i64) | |
| ^bb3(%33: i64): // 2 preds: ^bb2, ^bb9 | |
| %34 = llvm.icmp "slt" %33, %1 : i64 | |
| llvm.cond_br %34, ^bb4, ^bb10 | |
| ^bb4: // pred: ^bb3 | |
| %35 = llvm.mlir.constant(-1 : index) : i64 | |
| %36 = llvm.mul %29, %35 : i64 | |
| %37 = llvm.add %36, %1 : i64 | |
| %38 = llvm.icmp "slt" %26, %37 : i64 | |
| %39 = llvm.select %38, %26, %37 : i1, i64 | |
| %40 = llvm.mul %33, %35 : i64 | |
| %41 = llvm.add %40, %1 : i64 | |
| %42 = llvm.icmp "slt" %26, %41 : i64 | |
| %43 = llvm.select %42, %26, %41 : i1, i64 | |
| %44 = llvm.bitcast %11 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %45 = llvm.mul %29, %1 : i64 | |
| %46 = llvm.add %0, %45 : i64 | |
| %47 = llvm.mul %33, %4 : i64 | |
| %48 = llvm.add %46, %47 : i64 | |
| llvm.br ^bb5(%0 : i64) | |
| ^bb5(%49: i64): // 2 preds: ^bb4, ^bb8 | |
| %50 = llvm.icmp "slt" %49, %39 : i64 | |
| llvm.cond_br %50, ^bb6(%0 : i64), ^bb9 | |
| ^bb6(%51: i64): // 2 preds: ^bb5, ^bb7 | |
| %52 = llvm.icmp "slt" %51, %43 : i64 | |
| llvm.cond_br %52, ^bb7, ^bb8 | |
| ^bb7: // pred: ^bb6 | |
| %53 = llvm.add %49, %29 : i64 | |
| %54 = llvm.add %51, %33 : i64 | |
| %55 = llvm.icmp "eq" %53, %54 : i64 | |
| %56 = llvm.select %55, %2, %3 : i1, f32 | |
| %57 = llvm.mul %49, %1 : i64 | |
| %58 = llvm.add %48, %57 : i64 | |
| %59 = llvm.add %58, %51 : i64 | |
| %60 = llvm.getelementptr %44[%59] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| llvm.store %56, %60 : !llvm.ptr<f32> | |
| %61 = llvm.add %51, %4 : i64 | |
| llvm.br ^bb6(%61 : i64) | |
| ^bb8: // pred: ^bb6 | |
| %62 = llvm.add %49, %4 : i64 | |
| llvm.br ^bb5(%62 : i64) | |
| ^bb9: // pred: ^bb5 | |
| %63 = llvm.add %33, %32 : i64 | |
| llvm.br ^bb3(%63 : i64) | |
| ^bb10: // pred: ^bb3 | |
| %64 = llvm.add %29, %28 : i64 | |
| llvm.br ^bb1(%64 : i64) | |
| ^bb11: // pred: ^bb1 | |
| %65 = llvm.mlir.constant(0 : i32) : i32 | |
| llvm.return %65 : i32 | |
| } | |
| } | |
| } | |
| } | |
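// [editor's note, not compiler output] The hal.executable.entry_point above maps each workload
// dimension through affine_map<()[s0] -> (s0 ceildiv 64)>, matching workloadPerWorkgroup = [64, 64].
// Assuming the runtime workload equals the 10x10 extents baked into the function body as
// `constant 10 : index`, the returned workgroup grid is:
//   ceildiv(10, 64) = 1   =>   hal.return 1, 1, 1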
| // -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateTargetExecutableVariantsPass //----- // | |
| hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> { | |
| hal.executable.entry_point public @matmul_test_dispatch_2 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} { | |
| ^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors | |
| %c1 = constant 1 : index | |
| %0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg0] | |
| %1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1] | |
| hal.return %0, %1, %c1 : index, index, index | |
| } | |
| builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} { | |
| llvm.func internal @matmul_test_dispatch_2(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 attributes {sym_visibility = "private"} { | |
| %0 = llvm.mlir.constant(4 : index) : i64 | |
| %1 = llvm.mlir.constant(32 : index) : i64 | |
| %2 = llvm.mlir.constant(0 : index) : i64 | |
| %3 = llvm.mlir.constant(10 : index) : i64 | |
| %4 = llvm.mlir.constant(0.000000e+00 : f32) : f32 | |
| %5 = llvm.mlir.constant(1.000000e+00 : f32) : f32 | |
| %6 = llvm.mlir.constant(1 : index) : i64 | |
| %7 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %8 = llvm.extractvalue %7[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %9 = llvm.mlir.constant(0 : i64) : i64 | |
| %10 = llvm.getelementptr %8[%9] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %11 = llvm.load %10 : !llvm.ptr<ptr<i8>> | |
| %12 = llvm.getelementptr %11[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %13 = llvm.bitcast %12 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %14 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %15 = llvm.extractvalue %14[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %16 = llvm.mlir.constant(1 : i64) : i64 | |
| %17 = llvm.getelementptr %15[%16] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %18 = llvm.load %17 : !llvm.ptr<ptr<i8>> | |
| %19 = llvm.getelementptr %18[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %21 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %22 = llvm.extractvalue %21[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %23 = llvm.mlir.constant(2 : i64) : i64 | |
| %24 = llvm.getelementptr %22[%23] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %25 = llvm.load %24 : !llvm.ptr<ptr<i8>> | |
| %26 = llvm.getelementptr %25[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %27 = llvm.bitcast %26 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %28 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %29 = llvm.extractvalue %28[0] : !llvm.array<3 x i32> | |
| %30 = llvm.zext %29 : i32 to i64 | |
| %31 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %32 = llvm.extractvalue %31[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %33 = llvm.extractvalue %32[0] : !llvm.array<3 x i32> | |
| %34 = llvm.zext %33 : i32 to i64 | |
| %35 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %36 = llvm.extractvalue %35[1] : !llvm.array<3 x i32> | |
| %37 = llvm.zext %36 : i32 to i64 | |
| %38 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %39 = llvm.extractvalue %38[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %40 = llvm.extractvalue %39[1] : !llvm.array<3 x i32> | |
| %41 = llvm.zext %40 : i32 to i64 | |
| %42 = llvm.mlir.constant(64 : index) : i64 | |
| %43 = llvm.mul %37, %42 : i64 | |
| %44 = llvm.mul %41, %42 : i64 | |
| llvm.br ^bb1(%43 : i64) | |
| ^bb1(%45: i64): // 2 preds: ^bb0, ^bb28 | |
| %46 = llvm.icmp "slt" %45, %3 : i64 | |
| llvm.cond_br %46, ^bb2, ^bb29 | |
| ^bb2: // pred: ^bb1 | |
| %47 = llvm.mul %30, %42 : i64 | |
| %48 = llvm.mul %34, %42 : i64 | |
| llvm.br ^bb3(%47 : i64) | |
| ^bb3(%49: i64): // 2 preds: ^bb2, ^bb27 | |
| %50 = llvm.icmp "slt" %49, %3 : i64 | |
| llvm.cond_br %50, ^bb4, ^bb28 | |
| ^bb4: // pred: ^bb3 | |
| %51 = llvm.mlir.constant(-1 : index) : i64 | |
| %52 = llvm.mul %45, %51 : i64 | |
| %53 = llvm.add %52, %3 : i64 | |
| %54 = llvm.icmp "slt" %42, %53 : i64 | |
| %55 = llvm.select %54, %42, %53 : i1, i64 | |
| %56 = llvm.bitcast %13 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %57 = llvm.mul %45, %3 : i64 | |
| %58 = llvm.add %2, %57 : i64 | |
| %59 = llvm.mul %9, %6 : i64 | |
| %60 = llvm.add %58, %59 : i64 | |
| %61 = llvm.mlir.constant(10 : i64) : i64 | |
| %62 = llvm.mul %49, %51 : i64 | |
| %63 = llvm.add %62, %3 : i64 | |
| %64 = llvm.icmp "slt" %42, %63 : i64 | |
| %65 = llvm.select %64, %42, %63 : i1, i64 | |
| %66 = llvm.bitcast %20 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %67 = llvm.mul %9, %3 : i64 | |
| %68 = llvm.add %2, %67 : i64 | |
| %69 = llvm.mul %49, %6 : i64 | |
| %70 = llvm.add %68, %69 : i64 | |
| %71 = llvm.icmp "slt" %53, %42 : i64 | |
| %72 = llvm.select %71, %53, %42 : i1, i64 | |
| %73 = llvm.icmp "slt" %63, %42 : i64 | |
| %74 = llvm.select %73, %63, %42 : i1, i64 | |
| %75 = llvm.bitcast %27 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %76 = llvm.add %58, %69 : i64 | |
| llvm.br ^bb5(%2 : i64) | |
| ^bb5(%77: i64): // 2 preds: ^bb4, ^bb8 | |
| %78 = llvm.icmp "slt" %77, %72 : i64 | |
| llvm.cond_br %78, ^bb6(%2 : i64), ^bb9(%2 : i64) | |
| ^bb6(%79: i64): // 2 preds: ^bb5, ^bb7 | |
| %80 = llvm.icmp "slt" %79, %74 : i64 | |
| llvm.cond_br %80, ^bb7, ^bb8 | |
| ^bb7: // pred: ^bb6 | |
| %81 = llvm.add %77, %45 : i64 | |
| %82 = llvm.add %79, %49 : i64 | |
| %83 = llvm.icmp "eq" %81, %82 : i64 | |
| %84 = llvm.select %83, %4, %5 : i1, f32 | |
| %85 = llvm.mul %77, %3 : i64 | |
| %86 = llvm.add %76, %85 : i64 | |
| %87 = llvm.add %86, %79 : i64 | |
| %88 = llvm.getelementptr %75[%87] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| llvm.store %84, %88 : !llvm.ptr<f32> | |
| %89 = llvm.add %79, %6 : i64 | |
| llvm.br ^bb6(%89 : i64) | |
| ^bb8: // pred: ^bb6 | |
| %90 = llvm.add %77, %6 : i64 | |
| llvm.br ^bb5(%90 : i64) | |
| ^bb9(%91: i64): // 2 preds: ^bb5, ^bb26 | |
| %92 = llvm.icmp "slt" %91, %55 : i64 | |
| llvm.cond_br %92, ^bb10(%2 : i64), ^bb27 | |
| ^bb10(%93: i64): // 2 preds: ^bb9, ^bb25 | |
| %94 = llvm.icmp "slt" %93, %65 : i64 | |
| llvm.cond_br %94, ^bb11, ^bb26 | |
| ^bb11: // pred: ^bb10 | |
| %95 = llvm.mul %91, %51 : i64 | |
| %96 = llvm.add %55, %95 : i64 | |
| %97 = llvm.icmp "slt" %1, %96 : i64 | |
| %98 = llvm.select %97, %1, %96 : i1, i64 | |
| %99 = llvm.bitcast %56 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %100 = llvm.mul %91, %61 : i64 | |
| %101 = llvm.add %60, %100 : i64 | |
| %102 = llvm.mul %9, %16 : i64 | |
| %103 = llvm.add %101, %102 : i64 | |
| %104 = llvm.mul %93, %51 : i64 | |
| %105 = llvm.add %65, %104 : i64 | |
| %106 = llvm.icmp "slt" %1, %105 : i64 | |
| %107 = llvm.select %106, %1, %105 : i1, i64 | |
| %108 = llvm.bitcast %66 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %109 = llvm.mul %9, %61 : i64 | |
| %110 = llvm.add %70, %109 : i64 | |
| %111 = llvm.mul %93, %16 : i64 | |
| %112 = llvm.add %110, %111 : i64 | |
| %113 = llvm.bitcast %75 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %114 = llvm.add %76, %100 : i64 | |
| %115 = llvm.add %114, %111 : i64 | |
| llvm.br ^bb12(%2 : i64) | |
| ^bb12(%116: i64): // 2 preds: ^bb11, ^bb24 | |
| %117 = llvm.icmp "slt" %116, %98 : i64 | |
| llvm.cond_br %117, ^bb13(%2 : i64), ^bb25 | |
| ^bb13(%118: i64): // 2 preds: ^bb12, ^bb23 | |
| %119 = llvm.icmp "slt" %118, %107 : i64 | |
| llvm.cond_br %119, ^bb14(%2 : i64), ^bb24 | |
| ^bb14(%120: i64): // 2 preds: ^bb13, ^bb22 | |
| %121 = llvm.icmp "slt" %120, %3 : i64 | |
| llvm.cond_br %121, ^bb15, ^bb23 | |
| ^bb15: // pred: ^bb14 | |
| %122 = llvm.mul %116, %51 : i64 | |
| %123 = llvm.add %98, %122 : i64 | |
| %124 = llvm.icmp "slt" %0, %123 : i64 | |
| %125 = llvm.select %124, %0, %123 : i1, i64 | |
| %126 = llvm.mul %120, %51 : i64 | |
| %127 = llvm.add %126, %3 : i64 | |
| %128 = llvm.icmp "slt" %0, %127 : i64 | |
| %129 = llvm.select %128, %0, %127 : i1, i64 | |
| %130 = llvm.bitcast %99 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %131 = llvm.mul %116, %61 : i64 | |
| %132 = llvm.add %103, %131 : i64 | |
| %133 = llvm.mul %120, %16 : i64 | |
| %134 = llvm.add %132, %133 : i64 | |
| %135 = llvm.mul %118, %51 : i64 | |
| %136 = llvm.add %107, %135 : i64 | |
| %137 = llvm.icmp "slt" %0, %136 : i64 | |
| %138 = llvm.select %137, %0, %136 : i1, i64 | |
| %139 = llvm.bitcast %108 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %140 = llvm.mul %120, %61 : i64 | |
| %141 = llvm.add %112, %140 : i64 | |
| %142 = llvm.mul %118, %16 : i64 | |
| %143 = llvm.add %141, %142 : i64 | |
| %144 = llvm.bitcast %113 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %145 = llvm.add %115, %131 : i64 | |
| %146 = llvm.add %145, %142 : i64 | |
| llvm.br ^bb16(%2 : i64) | |
| ^bb16(%147: i64): // 2 preds: ^bb15, ^bb21 | |
| %148 = llvm.icmp "slt" %147, %125 : i64 | |
| llvm.cond_br %148, ^bb17(%2 : i64), ^bb22 | |
| ^bb17(%149: i64): // 2 preds: ^bb16, ^bb20 | |
| %150 = llvm.icmp "slt" %149, %138 : i64 | |
| llvm.cond_br %150, ^bb18(%2 : i64), ^bb21 | |
| ^bb18(%151: i64): // 2 preds: ^bb17, ^bb19 | |
| %152 = llvm.icmp "slt" %151, %129 : i64 | |
| llvm.cond_br %152, ^bb19, ^bb20 | |
| ^bb19: // pred: ^bb18 | |
| %153 = llvm.mul %147, %3 : i64 | |
| %154 = llvm.add %134, %153 : i64 | |
| %155 = llvm.add %154, %151 : i64 | |
| %156 = llvm.getelementptr %130[%155] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| %157 = llvm.load %156 : !llvm.ptr<f32> | |
| %158 = llvm.mul %151, %3 : i64 | |
| %159 = llvm.add %143, %158 : i64 | |
| %160 = llvm.add %159, %149 : i64 | |
| %161 = llvm.getelementptr %139[%160] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| %162 = llvm.load %161 : !llvm.ptr<f32> | |
| %163 = llvm.add %146, %153 : i64 | |
| %164 = llvm.add %163, %149 : i64 | |
| %165 = llvm.getelementptr %144[%164] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| %166 = llvm.load %165 : !llvm.ptr<f32> | |
| %167 = llvm.fmul %157, %162 : f32 | |
| %168 = llvm.fadd %166, %167 : f32 | |
| llvm.store %168, %165 : !llvm.ptr<f32> | |
| %169 = llvm.add %151, %6 : i64 | |
| llvm.br ^bb18(%169 : i64) | |
| ^bb20: // pred: ^bb18 | |
| %170 = llvm.add %149, %6 : i64 | |
| llvm.br ^bb17(%170 : i64) | |
| ^bb21: // pred: ^bb17 | |
| %171 = llvm.add %147, %6 : i64 | |
| llvm.br ^bb16(%171 : i64) | |
| ^bb22: // pred: ^bb16 | |
| %172 = llvm.add %120, %0 : i64 | |
| llvm.br ^bb14(%172 : i64) | |
| ^bb23: // pred: ^bb14 | |
| %173 = llvm.add %118, %0 : i64 | |
| llvm.br ^bb13(%173 : i64) | |
| ^bb24: // pred: ^bb13 | |
| %174 = llvm.add %116, %0 : i64 | |
| llvm.br ^bb12(%174 : i64) | |
| ^bb25: // pred: ^bb12 | |
| %175 = llvm.add %93, %1 : i64 | |
| llvm.br ^bb10(%175 : i64) | |
| ^bb26: // pred: ^bb10 | |
| %176 = llvm.add %91, %1 : i64 | |
| llvm.br ^bb9(%176 : i64) | |
| ^bb27: // pred: ^bb9 | |
| %177 = llvm.add %49, %48 : i64 | |
| llvm.br ^bb3(%177 : i64) | |
| ^bb28: // pred: ^bb3 | |
| %178 = llvm.add %45, %44 : i64 | |
| llvm.br ^bb1(%178 : i64) | |
| ^bb29: // pred: ^bb1 | |
| %179 = llvm.mlir.constant(0 : i32) : i32 | |
| llvm.return %179 : i32 | |
| } | |
| } | |
| } | |
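// [editor's note, not compiler output] Structure of the lowered @matmul_test_dispatch_2 body above,
// as far as it can be read from the control flow: ^bb1 and ^bb3 step over the 64x64 workgroup tiles
// (step = workgroup count * 64); ^bb5 through ^bb8 write the test input pattern, storing 0.0 where
// the two global indices are equal and 1.0 elsewhere (the icmp "eq" plus select); ^bb9 through
// ^bb18 form nested tiling loops whose innermost block, ^bb19, performs the same per-element
// multiply-accumulate as the reference matmul (fmul, fadd, store).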
| // -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass //----- // | |
| hal.executable private @matmul_test_dispatch_2 { | |
| hal.interface public @io { | |
| hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
| hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
| } | |
| hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> { | |
| hal.executable.entry_point public @matmul_test_dispatch_2 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} { | |
| ^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors | |
| %c1 = constant 1 : index | |
| %0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg0] | |
| %1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1] | |
| hal.return %0, %1, %c1 : index, index, index | |
| } | |
| builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} { | |
| llvm.func internal @matmul_test_dispatch_2(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 attributes {sym_visibility = "private"} { | |
| %0 = llvm.mlir.constant(4 : index) : i64 | |
| %1 = llvm.mlir.constant(32 : index) : i64 | |
| %2 = llvm.mlir.constant(0 : index) : i64 | |
| %3 = llvm.mlir.constant(10 : index) : i64 | |
| %4 = llvm.mlir.constant(0.000000e+00 : f32) : f32 | |
| %5 = llvm.mlir.constant(1.000000e+00 : f32) : f32 | |
| %6 = llvm.mlir.constant(1 : index) : i64 | |
| %7 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %8 = llvm.extractvalue %7[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %9 = llvm.mlir.constant(0 : i64) : i64 | |
| %10 = llvm.getelementptr %8[%9] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %11 = llvm.load %10 : !llvm.ptr<ptr<i8>> | |
| %12 = llvm.getelementptr %11[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %13 = llvm.bitcast %12 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %14 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %15 = llvm.extractvalue %14[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %16 = llvm.mlir.constant(1 : i64) : i64 | |
| %17 = llvm.getelementptr %15[%16] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %18 = llvm.load %17 : !llvm.ptr<ptr<i8>> | |
| %19 = llvm.getelementptr %18[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %21 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %22 = llvm.extractvalue %21[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %23 = llvm.mlir.constant(2 : i64) : i64 | |
| %24 = llvm.getelementptr %22[%23] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %25 = llvm.load %24 : !llvm.ptr<ptr<i8>> | |
| %26 = llvm.getelementptr %25[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %27 = llvm.bitcast %26 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
| %28 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %29 = llvm.extractvalue %28[0] : !llvm.array<3 x i32> | |
| %30 = llvm.zext %29 : i32 to i64 | |
| %31 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %32 = llvm.extractvalue %31[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %33 = llvm.extractvalue %32[0] : !llvm.array<3 x i32> | |
| %34 = llvm.zext %33 : i32 to i64 | |
| %35 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>> | |
| %36 = llvm.extractvalue %35[1] : !llvm.array<3 x i32> | |
| %37 = llvm.zext %36 : i32 to i64 | |
| %38 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>> | |
| %39 = llvm.extractvalue %38[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %40 = llvm.extractvalue %39[1] : !llvm.array<3 x i32> | |
| %41 = llvm.zext %40 : i32 to i64 | |
| %42 = llvm.mlir.constant(64 : index) : i64 | |
| %43 = llvm.mul %37, %42 : i64 | |
| %44 = llvm.mul %41, %42 : i64 | |
| llvm.br ^bb1(%43 : i64) | |
| ^bb1(%45: i64): // 2 preds: ^bb0, ^bb28 | |
| %46 = llvm.icmp "slt" %45, %3 : i64 | |
| llvm.cond_br %46, ^bb2, ^bb29 | |
| ^bb2: // pred: ^bb1 | |
| %47 = llvm.mul %30, %42 : i64 | |
| %48 = llvm.mul %34, %42 : i64 | |
| llvm.br ^bb3(%47 : i64) | |
| ^bb3(%49: i64): // 2 preds: ^bb2, ^bb27 | |
| %50 = llvm.icmp "slt" %49, %3 : i64 | |
| llvm.cond_br %50, ^bb4, ^bb28 | |
| ^bb4: // pred: ^bb3 | |
| %51 = llvm.mlir.constant(-1 : index) : i64 | |
| %52 = llvm.mul %45, %51 : i64 | |
| %53 = llvm.add %52, %3 : i64 | |
| %54 = llvm.icmp "slt" %42, %53 : i64 | |
| %55 = llvm.select %54, %42, %53 : i1, i64 | |
| %56 = llvm.bitcast %13 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %57 = llvm.mul %45, %3 : i64 | |
| %58 = llvm.add %2, %57 : i64 | |
| %59 = llvm.mul %9, %6 : i64 | |
| %60 = llvm.add %58, %59 : i64 | |
| %61 = llvm.mlir.constant(10 : i64) : i64 | |
| %62 = llvm.mul %49, %51 : i64 | |
| %63 = llvm.add %62, %3 : i64 | |
| %64 = llvm.icmp "slt" %42, %63 : i64 | |
| %65 = llvm.select %64, %42, %63 : i1, i64 | |
| %66 = llvm.bitcast %20 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %67 = llvm.mul %9, %3 : i64 | |
| %68 = llvm.add %2, %67 : i64 | |
| %69 = llvm.mul %49, %6 : i64 | |
| %70 = llvm.add %68, %69 : i64 | |
| %71 = llvm.icmp "slt" %53, %42 : i64 | |
| %72 = llvm.select %71, %53, %42 : i1, i64 | |
| %73 = llvm.icmp "slt" %63, %42 : i64 | |
| %74 = llvm.select %73, %63, %42 : i1, i64 | |
| %75 = llvm.bitcast %27 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %76 = llvm.add %58, %69 : i64 | |
| llvm.br ^bb5(%2 : i64) | |
| ^bb5(%77: i64): // 2 preds: ^bb4, ^bb8 | |
| %78 = llvm.icmp "slt" %77, %72 : i64 | |
| llvm.cond_br %78, ^bb6(%2 : i64), ^bb9(%2 : i64) | |
| ^bb6(%79: i64): // 2 preds: ^bb5, ^bb7 | |
| %80 = llvm.icmp "slt" %79, %74 : i64 | |
| llvm.cond_br %80, ^bb7, ^bb8 | |
| ^bb7: // pred: ^bb6 | |
| %81 = llvm.add %77, %45 : i64 | |
| %82 = llvm.add %79, %49 : i64 | |
| %83 = llvm.icmp "eq" %81, %82 : i64 | |
| %84 = llvm.select %83, %4, %5 : i1, f32 | |
| %85 = llvm.mul %77, %3 : i64 | |
| %86 = llvm.add %76, %85 : i64 | |
| %87 = llvm.add %86, %79 : i64 | |
| %88 = llvm.getelementptr %75[%87] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| llvm.store %84, %88 : !llvm.ptr<f32> | |
| %89 = llvm.add %79, %6 : i64 | |
| llvm.br ^bb6(%89 : i64) | |
| ^bb8: // pred: ^bb6 | |
| %90 = llvm.add %77, %6 : i64 | |
| llvm.br ^bb5(%90 : i64) | |
| ^bb9(%91: i64): // 2 preds: ^bb5, ^bb26 | |
| %92 = llvm.icmp "slt" %91, %55 : i64 | |
| llvm.cond_br %92, ^bb10(%2 : i64), ^bb27 | |
| ^bb10(%93: i64): // 2 preds: ^bb9, ^bb25 | |
| %94 = llvm.icmp "slt" %93, %65 : i64 | |
| llvm.cond_br %94, ^bb11, ^bb26 | |
| ^bb11: // pred: ^bb10 | |
| %95 = llvm.mul %91, %51 : i64 | |
| %96 = llvm.add %55, %95 : i64 | |
| %97 = llvm.icmp "slt" %1, %96 : i64 | |
| %98 = llvm.select %97, %1, %96 : i1, i64 | |
| %99 = llvm.bitcast %56 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %100 = llvm.mul %91, %61 : i64 | |
| %101 = llvm.add %60, %100 : i64 | |
| %102 = llvm.mul %9, %16 : i64 | |
| %103 = llvm.add %101, %102 : i64 | |
| %104 = llvm.mul %93, %51 : i64 | |
| %105 = llvm.add %65, %104 : i64 | |
| %106 = llvm.icmp "slt" %1, %105 : i64 | |
| %107 = llvm.select %106, %1, %105 : i1, i64 | |
| %108 = llvm.bitcast %66 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %109 = llvm.mul %9, %61 : i64 | |
| %110 = llvm.add %70, %109 : i64 | |
| %111 = llvm.mul %93, %16 : i64 | |
| %112 = llvm.add %110, %111 : i64 | |
| %113 = llvm.bitcast %75 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %114 = llvm.add %76, %100 : i64 | |
| %115 = llvm.add %114, %111 : i64 | |
| llvm.br ^bb12(%2 : i64) | |
| ^bb12(%116: i64): // 2 preds: ^bb11, ^bb24 | |
| %117 = llvm.icmp "slt" %116, %98 : i64 | |
| llvm.cond_br %117, ^bb13(%2 : i64), ^bb25 | |
| ^bb13(%118: i64): // 2 preds: ^bb12, ^bb23 | |
| %119 = llvm.icmp "slt" %118, %107 : i64 | |
| llvm.cond_br %119, ^bb14(%2 : i64), ^bb24 | |
| ^bb14(%120: i64): // 2 preds: ^bb13, ^bb22 | |
| %121 = llvm.icmp "slt" %120, %3 : i64 | |
| llvm.cond_br %121, ^bb15, ^bb23 | |
| ^bb15: // pred: ^bb14 | |
| %122 = llvm.mul %116, %51 : i64 | |
| %123 = llvm.add %98, %122 : i64 | |
| %124 = llvm.icmp "slt" %0, %123 : i64 | |
| %125 = llvm.select %124, %0, %123 : i1, i64 | |
| %126 = llvm.mul %120, %51 : i64 | |
| %127 = llvm.add %126, %3 : i64 | |
| %128 = llvm.icmp "slt" %0, %127 : i64 | |
| %129 = llvm.select %128, %0, %127 : i1, i64 | |
| %130 = llvm.bitcast %99 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %131 = llvm.mul %116, %61 : i64 | |
| %132 = llvm.add %103, %131 : i64 | |
| %133 = llvm.mul %120, %16 : i64 | |
| %134 = llvm.add %132, %133 : i64 | |
| %135 = llvm.mul %118, %51 : i64 | |
| %136 = llvm.add %107, %135 : i64 | |
| %137 = llvm.icmp "slt" %0, %136 : i64 | |
| %138 = llvm.select %137, %0, %136 : i1, i64 | |
| %139 = llvm.bitcast %108 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %140 = llvm.mul %120, %61 : i64 | |
| %141 = llvm.add %112, %140 : i64 | |
| %142 = llvm.mul %118, %16 : i64 | |
| %143 = llvm.add %141, %142 : i64 | |
| %144 = llvm.bitcast %113 : !llvm.ptr<f32> to !llvm.ptr<f32> | |
| %145 = llvm.add %115, %131 : i64 | |
| %146 = llvm.add %145, %142 : i64 | |
| llvm.br ^bb16(%2 : i64) | |
| ^bb16(%147: i64): // 2 preds: ^bb15, ^bb21 | |
| %148 = llvm.icmp "slt" %147, %125 : i64 | |
| llvm.cond_br %148, ^bb17(%2 : i64), ^bb22 | |
| ^bb17(%149: i64): // 2 preds: ^bb16, ^bb20 | |
| %150 = llvm.icmp "slt" %149, %138 : i64 | |
| llvm.cond_br %150, ^bb18(%2 : i64), ^bb21 | |
| ^bb18(%151: i64): // 2 preds: ^bb17, ^bb19 | |
| %152 = llvm.icmp "slt" %151, %129 : i64 | |
| llvm.cond_br %152, ^bb19, ^bb20 | |
| ^bb19: // pred: ^bb18 | |
| %153 = llvm.mul %147, %3 : i64 | |
| %154 = llvm.add %134, %153 : i64 | |
| %155 = llvm.add %154, %151 : i64 | |
| %156 = llvm.getelementptr %130[%155] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| %157 = llvm.load %156 : !llvm.ptr<f32> | |
| %158 = llvm.mul %151, %3 : i64 | |
| %159 = llvm.add %143, %158 : i64 | |
| %160 = llvm.add %159, %149 : i64 | |
| %161 = llvm.getelementptr %139[%160] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| %162 = llvm.load %161 : !llvm.ptr<f32> | |
| %163 = llvm.add %146, %153 : i64 | |
| %164 = llvm.add %163, %149 : i64 | |
| %165 = llvm.getelementptr %144[%164] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| %166 = llvm.load %165 : !llvm.ptr<f32> | |
| %167 = llvm.fmul %157, %162 : f32 | |
| %168 = llvm.fadd %166, %167 : f32 | |
| llvm.store %168, %165 : !llvm.ptr<f32> | |
| %169 = llvm.add %151, %6 : i64 | |
| llvm.br ^bb18(%169 : i64) | |
| ^bb20: // pred: ^bb18 | |
| %170 = llvm.add %149, %6 : i64 | |
| llvm.br ^bb17(%170 : i64) | |
| ^bb21: // pred: ^bb17 | |
| %171 = llvm.add %147, %6 : i64 | |
| llvm.br ^bb16(%171 : i64) | |
| ^bb22: // pred: ^bb16 | |
| %172 = llvm.add %120, %0 : i64 | |
| llvm.br ^bb14(%172 : i64) | |
| ^bb23: // pred: ^bb14 | |
| %173 = llvm.add %118, %0 : i64 | |
| llvm.br ^bb13(%173 : i64) | |
| ^bb24: // pred: ^bb13 | |
| %174 = llvm.add %116, %0 : i64 | |
| llvm.br ^bb12(%174 : i64) | |
| ^bb25: // pred: ^bb12 | |
| %175 = llvm.add %93, %1 : i64 | |
| llvm.br ^bb10(%175 : i64) | |
| ^bb26: // pred: ^bb10 | |
| %176 = llvm.add %91, %1 : i64 | |
| llvm.br ^bb9(%176 : i64) | |
| ^bb27: // pred: ^bb9 | |
| %177 = llvm.add %49, %48 : i64 | |
| llvm.br ^bb3(%177 : i64) | |
| ^bb28: // pred: ^bb3 | |
| %178 = llvm.add %45, %44 : i64 | |
| llvm.br ^bb1(%178 : i64) | |
| ^bb29: // pred: ^bb1 | |
| %179 = llvm.mlir.constant(0 : i32) : i32 | |
| llvm.return %179 : i32 | |
| } | |
| } | |
| } | |
| } | |
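// [editor's note, not compiler output] This is the last successfully translated executable in the
// log. Everything that follows is diagnostic output: first the errors triggered by the failure to
// translate @matmul_test_dispatch_3, the dispatch containing the reference @expected computation,
// and then the module-level failures ("failed to serialize executables", "conversion from source
// -> vm failed") that result from it.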
| <stdin>:11:10: error: unhandled multiple roots in dispatch region | |
| %0 = linalg.generic {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) { | |
| ^ | |
| <stdin>:60:10: note: called from | |
| %7 = call @expected(%1, %3, %5) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32> | |
| ^ | |
| <stdin>:11:10: note: see current operation: %30 = "linalg.generic"(%18, %20, %29) ( { | |
| ^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors | |
| %31 = "std.mulf"(%arg2, %arg3) : (f32, f32) -> f32 | |
| %32 = "std.addf"(%31, %arg4) : (f32, f32) -> f32 | |
| "linalg.yield"(%32) : (f32) -> () | |
| }) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x10xf32>, tensor<10x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32> | |
| %0 = linalg.generic {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) { | |
| ^ | |
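// [editor's note, not compiler output] Reading of the diagnostic above: the error location is the
// linalg.generic inside @expected quoted at <stdin>:11:10, and the larger dump in the notes that
// follow shows it (as %30) tiled inside a dispatch function that also contains the generic
// producing its output operand (%29). Dispatch-region handling reports more than one root op it
// cannot handle; the subsequent errors at the same location appear to be downstream consequences
// of this first failure.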
| <stdin>:11:10: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> | |
| %0 = linalg.generic {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) { | |
| ^ | |
| <stdin>:60:10: note: called from | |
| %7 = call @expected(%1, %3, %5) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32> | |
| ^ | |
| <stdin>:11:10: note: see current operation: "hal.executable.variant"() ( { | |
| "hal.executable.entry_point"() {interface = @io, ordinal = 0 : index, sym_name = "matmul_test_dispatch_3", translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} : () -> () | |
| "builtin.module"() ( { | |
| "builtin.func"() ( { | |
| %0 = "std.constant"() {value = 1.000000e+00 : f32} : () -> f32 | |
| %1 = "std.constant"() {value = 0.000000e+00 : f32} : () -> f32 | |
| %2 = "std.constant"() {value = 10 : index} : () -> index | |
| %3 = "std.constant"() {value = 0 : index} : () -> index | |
| %4 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b0_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32> | |
| %5 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b1_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32> | |
| %6 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b2_xw_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %7 = "hal.interface.workgroup.size"() {dimension = 0 : index} : () -> index | |
| %8 = "hal.interface.workgroup.size"() {dimension = 1 : index} : () -> index | |
| %9 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
| %10 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index | |
| %11 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
| %12 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index | |
| %13 = "affine.apply"(%11, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| %14 = "affine.apply"(%12, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| "scf.for"(%13, %2, %14) ( { | |
| ^bb0(%arg0: index): // no predecessors | |
| %15 = "affine.apply"(%9, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| %16 = "affine.apply"(%10, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| "scf.for"(%15, %2, %16) ( { | |
| ^bb0(%arg1: index): // no predecessors | |
| %17 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %18 = "flow.dispatch.tensor.load"(%4, %arg0, %17) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [-9223372036854775808, 0], static_sizes = [-1, 10], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<?x10xf32> | |
| %19 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %20 = "flow.dispatch.tensor.load"(%5, %arg1, %19) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [0, -9223372036854775808], static_sizes = [10, -1], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<10x?xf32> | |
| %21 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %22 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %23 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %24 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %25 = "linalg.init_tensor"(%23, %24) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32> | |
| %26 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %27 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %28 = "linalg.init_tensor"(%26, %27) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32> | |
| %29 = "linalg.generic"(%25, %28) ( { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %31 = "linalg.index"() {dim = 0 : i64} : () -> index | |
| %32 = "affine.apply"(%31, %arg0) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index | |
| %33 = "linalg.index"() {dim = 1 : i64} : () -> index | |
| %34 = "affine.apply"(%33, %arg1) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index | |
| %35 = "std.cmpi"(%32, %34) {predicate = 0 : i64} : (index, index) -> i1 | |
| %36 = "std.select"(%35, %1, %0) : (i1, f32, f32) -> f32 | |
| "linalg.yield"(%36) : (f32) -> () | |
| }) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32> | |
| %30 = "linalg.generic"(%18, %20, %29) ( { | |
| ^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors | |
| %31 = "std.mulf"(%arg2, %arg3) : (f32, f32) -> f32 | |
| %32 = "std.addf"(%31, %arg4) : (f32, f32) -> f32 | |
| "linalg.yield"(%32) : (f32) -> () | |
| }) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x10xf32>, tensor<10x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32> | |
| "flow.dispatch.tensor.store"(%30, %6, %arg0, %arg1, %21, %22) {operand_segment_sizes = dense<[1, 1, 2, 2, 0]> : vector<5xi32>, static_offsets = [-9223372036854775808, -9223372036854775808], static_sizes = [-1, -1], static_strides = [1, 1]} : (tensor<?x?xf32>, !flow.dispatch.tensor<writeonly:10x10xf32>, index, index, index, index) -> () | |
| "scf.yield"() : () -> () | |
| }) : (index, index, index) -> () | |
| "scf.yield"() : () -> () | |
| }) : (index, index, index) -> () | |
| "std.return"() : () -> () | |
| }) {sym_name = "matmul_test_dispatch_3", type = () -> ()} : () -> () | |
| "hal.interface"() ( { | |
| "hal.interface.binding"() {access = 1 : i32, binding = 0 : index, set = 0 : index, sym_name = "s0b0_ro_external", type = 7 : i32} : () -> () | |
| "hal.interface.binding"() {access = 1 : i32, binding = 1 : index, set = 0 : index, sym_name = "s0b1_ro_external", type = 7 : i32} : () -> () | |
| "hal.interface.binding"() {access = 6 : i32, binding = 2 : index, set = 0 : index, sym_name = "s0b2_xw_external", type = 7 : i32} : () -> () | |
| "hal.interface_end"() : () -> () | |
| }) {sym_name = "io", sym_visibility = "private"} : () -> () | |
| }) : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "system_elf_x86_64", target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>} : () -> () | |
| %0 = linalg.generic {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) { | |
| ^ | |
| <stdin>:11:10: error: failed to serialize executables | |
| %0 = linalg.generic {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) { | |
| ^ | |
| <stdin>:60:10: note: called from | |
| %7 = call @expected(%1, %3, %5) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32> | |
| ^ | |
| <stdin>:11:10: note: see current operation: "hal.executable"() ( { | |
| "hal.interface"() ( { | |
| "hal.interface.binding"() {access = 1 : i32, binding = 0 : index, set = 0 : index, sym_name = "s0b0_ro_external", type = 7 : i32} : () -> () | |
| "hal.interface.binding"() {access = 1 : i32, binding = 1 : index, set = 0 : index, sym_name = "s0b1_ro_external", type = 7 : i32} : () -> () | |
| "hal.interface.binding"() {access = 6 : i32, binding = 2 : index, set = 0 : index, sym_name = "s0b2_xw_external", type = 7 : i32} : () -> () | |
| "hal.interface_end"() : () -> () | |
| }) {sym_name = "io"} : () -> () | |
| "hal.executable.variant"() ( { | |
| "hal.executable.entry_point"() {interface = @io, ordinal = 0 : index, sym_name = "matmul_test_dispatch_3", translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} : () -> () | |
| "builtin.module"() ( { | |
| "builtin.func"() ( { | |
| %0 = "std.constant"() {value = 1.000000e+00 : f32} : () -> f32 | |
| %1 = "std.constant"() {value = 0.000000e+00 : f32} : () -> f32 | |
| %2 = "std.constant"() {value = 10 : index} : () -> index | |
| %3 = "std.constant"() {value = 0 : index} : () -> index | |
| %4 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b0_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32> | |
| %5 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b1_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32> | |
| %6 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b2_xw_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %7 = "hal.interface.workgroup.size"() {dimension = 0 : index} : () -> index | |
| %8 = "hal.interface.workgroup.size"() {dimension = 1 : index} : () -> index | |
| %9 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
| %10 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index | |
| %11 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
| %12 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index | |
| %13 = "affine.apply"(%11, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| %14 = "affine.apply"(%12, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| "scf.for"(%13, %2, %14) ( { | |
| ^bb0(%arg0: index): // no predecessors | |
| %15 = "affine.apply"(%9, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| %16 = "affine.apply"(%10, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| "scf.for"(%15, %2, %16) ( { | |
| ^bb0(%arg1: index): // no predecessors | |
| %17 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %18 = "flow.dispatch.tensor.load"(%4, %arg0, %17) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [-9223372036854775808, 0], static_sizes = [-1, 10], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<?x10xf32> | |
| %19 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %20 = "flow.dispatch.tensor.load"(%5, %arg1, %19) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [0, -9223372036854775808], static_sizes = [10, -1], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<10x?xf32> | |
| %21 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %22 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %23 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %24 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %25 = "linalg.init_tensor"(%23, %24) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32> | |
| %26 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %27 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %28 = "linalg.init_tensor"(%26, %27) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32> | |
| %29 = "linalg.generic"(%25, %28) ( { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %31 = "linalg.index"() {dim = 0 : i64} : () -> index | |
| %32 = "affine.apply"(%31, %arg0) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index | |
| %33 = "linalg.index"() {dim = 1 : i64} : () -> index | |
| %34 = "affine.apply"(%33, %arg1) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index | |
| %35 = "std.cmpi"(%32, %34) {predicate = 0 : i64} : (index, index) -> i1 | |
| %36 = "std.select"(%35, %1, %0) : (i1, f32, f32) -> f32 | |
| "linalg.yield"(%36) : (f32) -> () | |
| }) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32> | |
| %30 = "linalg.generic"(%18, %20, %29) ( { | |
| ^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors | |
| %31 = "std.mulf"(%arg2, %arg3) : (f32, f32) -> f32 | |
| %32 = "std.addf"(%31, %arg4) : (f32, f32) -> f32 | |
| "linalg.yield"(%32) : (f32) -> () | |
| }) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x10xf32>, tensor<10x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32> | |
| "flow.dispatch.tensor.store"(%30, %6, %arg0, %arg1, %21, %22) {operand_segment_sizes = dense<[1, 1, 2, 2, 0]> : vector<5xi32>, static_offsets = [-9223372036854775808, -9223372036854775808], static_sizes = [-1, -1], static_strides = [1, 1]} : (tensor<?x?xf32>, !flow.dispatch.tensor<writeonly:10x10xf32>, index, index, index, index) -> () | |
| "scf.yield"() : () -> () | |
| }) : (index, index, index) -> () | |
| "scf.yield"() : () -> () | |
| }) : (index, index, index) -> () | |
| "std.return"() : () -> () | |
| }) {sym_name = "matmul_test_dispatch_3", type = () -> ()} : () -> () | |
| "hal.interface"() ( { | |
| "hal.interface.binding"() {access = 1 : i32, binding = 0 : index, set = 0 : index, sym_name = "s0b0_ro_external", type = 7 : i32} : () -> () | |
| "hal.interface.binding"() {access = 1 : i32, binding = 1 : index, set = 0 : index, sym_name = "s0b1_ro_external", type = 7 : i32} : () -> () | |
| "hal.interface.binding"() {access = 6 : i32, binding = 2 : index, set = 0 : index, sym_name = "s0b2_xw_external", type = 7 : i32} : () -> () | |
| "hal.interface_end"() : () -> () | |
| }) {sym_name = "io", sym_visibility = "private"} : () -> () | |
| }) : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "system_elf_x86_64", target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>} : () -> () | |
| "hal.executable_end"() : () -> () | |
| }) {sym_name = "matmul_test_dispatch_3", sym_visibility = "private"} : () -> () | |
| %0 = linalg.generic {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) { | |
| ^ | |
| <stdin>:5:1: error: conversion from source -> vm failed | |
| module { | |
| ^ | |
| <stdin>:5:1: note: see current operation: "builtin.module"() ( { | |
| "hal.executable"() ( { | |
| "hal.interface"() ( { | |
| "hal.interface.binding"() {access = 6 : i32, binding = 0 : index, set = 0 : index, sym_name = "s0b0_xw_external", type = 7 : i32} : () -> () | |
| "hal.interface_end"() : () -> () | |
| }) {sym_name = "io"} : () -> () | |
| "hal.executable.variant"() ( { | |
| "hal.executable.entry_point"() ( { | |
| ^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors | |
| %0 = "std.constant"() {value = 1 : index} : () -> index | |
| %1 = "affine.apply"(%arg0) {map = affine_map<()[s0] -> (s0 ceildiv 64)>} : (index) -> index | |
| %2 = "affine.apply"(%arg1) {map = affine_map<()[s0] -> (s0 ceildiv 64)>} : (index) -> index | |
| "hal.return"(%1, %2, %0) : (index, index, index) -> () | |
| }) {interface = @io, ordinal = 0 : index, sym_name = "matmul_test_dispatch_0", translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} : () -> () | |
| "builtin.module"() ( { | |
| "llvm.func"() ( { | |
| ^bb0(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>): // no predecessors | |
| %0 = "llvm.mlir.constant"() {value = 0 : index} : () -> i64 | |
| %1 = "llvm.mlir.constant"() {value = 10 : index} : () -> i64 | |
| %2 = "llvm.mlir.constant"() {value = 0.000000e+00 : f32} : () -> f32 | |
| %3 = "llvm.mlir.constant"() {value = 1.000000e+00 : f32} : () -> f32 | |
| %4 = "llvm.mlir.constant"() {value = 1 : index} : () -> i64 | |
| %5 = "llvm.load"(%arg0) : (!llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>) -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %6 = "llvm.extractvalue"(%5) {position = [5]} : (!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>) -> !llvm.ptr<ptr<i8>> | |
| %7 = "llvm.mlir.constant"() {value = 0 : i64} : () -> i64 | |
| %8 = "llvm.getelementptr"(%6, %7) : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %9 = "llvm.load"(%8) : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<i8> | |
| %10 = "llvm.getelementptr"(%9, %0) : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %11 = "llvm.bitcast"(%10) : (!llvm.ptr<i8>) -> !llvm.ptr<f32> | |
| %12 = "llvm.load"(%arg1) : (!llvm.ptr<array<3 x i32>>) -> !llvm.array<3 x i32> | |
| %13 = "llvm.extractvalue"(%12) {position = [0]} : (!llvm.array<3 x i32>) -> i32 | |
| %14 = "llvm.zext"(%13) : (i32) -> i64 | |
| %15 = "llvm.load"(%arg0) : (!llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>) -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %16 = "llvm.extractvalue"(%15) {position = [0]} : (!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>) -> !llvm.array<3 x i32> | |
| %17 = "llvm.extractvalue"(%16) {position = [0]} : (!llvm.array<3 x i32>) -> i32 | |
| %18 = "llvm.zext"(%17) : (i32) -> i64 | |
| %19 = "llvm.load"(%arg1) : (!llvm.ptr<array<3 x i32>>) -> !llvm.array<3 x i32> | |
| %20 = "llvm.extractvalue"(%19) {position = [1]} : (!llvm.array<3 x i32>) -> i32 | |
| %21 = "llvm.zext"(%20) : (i32) -> i64 | |
| %22 = "llvm.load"(%arg0) : (!llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>) -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %23 = "llvm.extractvalue"(%22) {position = [0]} : (!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>) -> !llvm.array<3 x i32> | |
| %24 = "llvm.extractvalue"(%23) {position = [1]} : (!llvm.array<3 x i32>) -> i32 | |
| %25 = "llvm.zext"(%24) : (i32) -> i64 | |
| %26 = "llvm.mlir.constant"() {value = 64 : index} : () -> i64 | |
| %27 = "llvm.mul"(%21, %26) : (i64, i64) -> i64 | |
| %28 = "llvm.mul"(%25, %26) : (i64, i64) -> i64 | |
| "llvm.br"(%27)[^bb1] : (i64) -> () | |
| ^bb1(%29: i64): // 2 preds: ^bb0, ^bb10 | |
| %30 = "llvm.icmp"(%29, %1) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| "llvm.cond_br"(%30)[^bb2, ^bb11] {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (i1) -> () | |
| ^bb2: // pred: ^bb1 | |
| %31 = "llvm.mul"(%14, %26) : (i64, i64) -> i64 | |
| %32 = "llvm.mul"(%18, %26) : (i64, i64) -> i64 | |
| "llvm.br"(%31)[^bb3] : (i64) -> () | |
| ^bb3(%33: i64): // 2 preds: ^bb2, ^bb9 | |
| %34 = "llvm.icmp"(%33, %1) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| "llvm.cond_br"(%34)[^bb4, ^bb10] {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (i1) -> () | |
| ^bb4: // pred: ^bb3 | |
| %35 = "llvm.mlir.constant"() {value = -1 : index} : () -> i64 | |
| %36 = "llvm.mul"(%29, %35) : (i64, i64) -> i64 | |
| %37 = "llvm.add"(%36, %1) : (i64, i64) -> i64 | |
| %38 = "llvm.icmp"(%26, %37) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| %39 = "llvm.select"(%38, %26, %37) : (i1, i64, i64) -> i64 | |
| %40 = "llvm.mul"(%33, %35) : (i64, i64) -> i64 | |
| %41 = "llvm.add"(%40, %1) : (i64, i64) -> i64 | |
| %42 = "llvm.icmp"(%26, %41) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| %43 = "llvm.select"(%42, %26, %41) : (i1, i64, i64) -> i64 | |
| %44 = "llvm.bitcast"(%11) : (!llvm.ptr<f32>) -> !llvm.ptr<f32> | |
| %45 = "llvm.mul"(%29, %1) : (i64, i64) -> i64 | |
| %46 = "llvm.add"(%0, %45) : (i64, i64) -> i64 | |
| %47 = "llvm.mul"(%33, %4) : (i64, i64) -> i64 | |
| %48 = "llvm.add"(%46, %47) : (i64, i64) -> i64 | |
| "llvm.br"(%0)[^bb5] : (i64) -> () | |
| ^bb5(%49: i64): // 2 preds: ^bb4, ^bb8 | |
| %50 = "llvm.icmp"(%49, %39) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| "llvm.cond_br"(%50, %0)[^bb6, ^bb9] {operand_segment_sizes = dense<[1, 1, 0]> : vector<3xi32>} : (i1, i64) -> () | |
| ^bb6(%51: i64): // 2 preds: ^bb5, ^bb7 | |
| %52 = "llvm.icmp"(%51, %43) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| "llvm.cond_br"(%52)[^bb7, ^bb8] {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (i1) -> () | |
| ^bb7: // pred: ^bb6 | |
| %53 = "llvm.add"(%49, %29) : (i64, i64) -> i64 | |
| %54 = "llvm.add"(%51, %33) : (i64, i64) -> i64 | |
| %55 = "llvm.icmp"(%53, %54) {predicate = 0 : i64} : (i64, i64) -> i1 | |
| %56 = "llvm.select"(%55, %2, %3) : (i1, f32, f32) -> f32 | |
| %57 = "llvm.mul"(%49, %1) : (i64, i64) -> i64 | |
| %58 = "llvm.add"(%48, %57) : (i64, i64) -> i64 | |
| %59 = "llvm.add"(%58, %51) : (i64, i64) -> i64 | |
| %60 = "llvm.getelementptr"(%44, %59) : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| "llvm.store"(%56, %60) : (f32, !llvm.ptr<f32>) -> () | |
| %61 = "llvm.add"(%51, %4) : (i64, i64) -> i64 | |
| "llvm.br"(%61)[^bb6] : (i64) -> () | |
| ^bb8: // pred: ^bb6 | |
| %62 = "llvm.add"(%49, %4) : (i64, i64) -> i64 | |
| "llvm.br"(%62)[^bb5] : (i64) -> () | |
| ^bb9: // pred: ^bb5 | |
| %63 = "llvm.add"(%33, %32) : (i64, i64) -> i64 | |
| "llvm.br"(%63)[^bb3] : (i64) -> () | |
| ^bb10: // pred: ^bb3 | |
| %64 = "llvm.add"(%29, %28) : (i64, i64) -> i64 | |
| "llvm.br"(%64)[^bb1] : (i64) -> () | |
| ^bb11: // pred: ^bb1 | |
| %65 = "llvm.mlir.constant"() {value = 0 : i32} : () -> i32 | |
| "llvm.return"(%65) : (i32) -> () | |
| }) {linkage = #llvm.linkage<internal>, sym_name = "matmul_test_dispatch_0", sym_visibility = "private", type = !llvm.func<i32 (ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, ptr<array<3 x i32>>, ptr<i8>)>} : () -> () | |
| }) {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "system_elf_x86_64", target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>} : () -> () | |
| "hal.executable_end"() : () -> () | |
| }) {sym_name = "matmul_test_dispatch_0", sym_visibility = "private"} : () -> () | |
| "hal.executable"() ( { | |
| "hal.interface"() ( { | |
| "hal.interface.binding"() {access = 1 : i32, binding = 0 : index, set = 0 : index, sym_name = "s0b0_ro_external", type = 7 : i32} : () -> () | |
| "hal.interface.binding"() {access = 1 : i32, binding = 1 : index, set = 0 : index, sym_name = "s0b1_ro_external", type = 7 : i32} : () -> () | |
| "hal.interface.binding"() {access = 6 : i32, binding = 2 : index, set = 0 : index, sym_name = "s0b2_xw_external", type = 7 : i32} : () -> () | |
| "hal.interface_end"() : () -> () | |
| }) {sym_name = "io"} : () -> () | |
| "hal.executable.variant"() ( { | |
| "hal.executable.entry_point"() ( { | |
| ^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors | |
| %0 = "std.constant"() {value = 1 : index} : () -> index | |
| %1 = "affine.apply"(%arg0) {map = affine_map<()[s0] -> (s0 ceildiv 64)>} : (index) -> index | |
| %2 = "affine.apply"(%arg1) {map = affine_map<()[s0] -> (s0 ceildiv 64)>} : (index) -> index | |
| "hal.return"(%1, %2, %0) : (index, index, index) -> () | |
| }) {interface = @io, ordinal = 0 : index, sym_name = "matmul_test_dispatch_2", translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} : () -> () | |
| "builtin.module"() ( { | |
| "llvm.func"() ( { | |
| ^bb0(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>): // no predecessors | |
| %0 = "llvm.mlir.constant"() {value = 4 : index} : () -> i64 | |
| %1 = "llvm.mlir.constant"() {value = 32 : index} : () -> i64 | |
| %2 = "llvm.mlir.constant"() {value = 0 : index} : () -> i64 | |
| %3 = "llvm.mlir.constant"() {value = 10 : index} : () -> i64 | |
| %4 = "llvm.mlir.constant"() {value = 0.000000e+00 : f32} : () -> f32 | |
| %5 = "llvm.mlir.constant"() {value = 1.000000e+00 : f32} : () -> f32 | |
| %6 = "llvm.mlir.constant"() {value = 1 : index} : () -> i64 | |
| %7 = "llvm.load"(%arg0) : (!llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>) -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %8 = "llvm.extractvalue"(%7) {position = [5]} : (!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>) -> !llvm.ptr<ptr<i8>> | |
| %9 = "llvm.mlir.constant"() {value = 0 : i64} : () -> i64 | |
| %10 = "llvm.getelementptr"(%8, %9) : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %11 = "llvm.load"(%10) : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<i8> | |
| %12 = "llvm.getelementptr"(%11, %2) : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %13 = "llvm.bitcast"(%12) : (!llvm.ptr<i8>) -> !llvm.ptr<f32> | |
| %14 = "llvm.load"(%arg0) : (!llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>) -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %15 = "llvm.extractvalue"(%14) {position = [5]} : (!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>) -> !llvm.ptr<ptr<i8>> | |
| %16 = "llvm.mlir.constant"() {value = 1 : i64} : () -> i64 | |
| %17 = "llvm.getelementptr"(%15, %16) : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %18 = "llvm.load"(%17) : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<i8> | |
| %19 = "llvm.getelementptr"(%18, %2) : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %20 = "llvm.bitcast"(%19) : (!llvm.ptr<i8>) -> !llvm.ptr<f32> | |
| %21 = "llvm.load"(%arg0) : (!llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>) -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %22 = "llvm.extractvalue"(%21) {position = [5]} : (!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>) -> !llvm.ptr<ptr<i8>> | |
| %23 = "llvm.mlir.constant"() {value = 2 : i64} : () -> i64 | |
| %24 = "llvm.getelementptr"(%22, %23) : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
| %25 = "llvm.load"(%24) : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<i8> | |
| %26 = "llvm.getelementptr"(%25, %2) : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8> | |
| %27 = "llvm.bitcast"(%26) : (!llvm.ptr<i8>) -> !llvm.ptr<f32> | |
| %28 = "llvm.load"(%arg1) : (!llvm.ptr<array<3 x i32>>) -> !llvm.array<3 x i32> | |
| %29 = "llvm.extractvalue"(%28) {position = [0]} : (!llvm.array<3 x i32>) -> i32 | |
| %30 = "llvm.zext"(%29) : (i32) -> i64 | |
| %31 = "llvm.load"(%arg0) : (!llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>) -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %32 = "llvm.extractvalue"(%31) {position = [0]} : (!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>) -> !llvm.array<3 x i32> | |
| %33 = "llvm.extractvalue"(%32) {position = [0]} : (!llvm.array<3 x i32>) -> i32 | |
| %34 = "llvm.zext"(%33) : (i32) -> i64 | |
| %35 = "llvm.load"(%arg1) : (!llvm.ptr<array<3 x i32>>) -> !llvm.array<3 x i32> | |
| %36 = "llvm.extractvalue"(%35) {position = [1]} : (!llvm.array<3 x i32>) -> i32 | |
| %37 = "llvm.zext"(%36) : (i32) -> i64 | |
| %38 = "llvm.load"(%arg0) : (!llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>) -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)> | |
| %39 = "llvm.extractvalue"(%38) {position = [0]} : (!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>) -> !llvm.array<3 x i32> | |
| %40 = "llvm.extractvalue"(%39) {position = [1]} : (!llvm.array<3 x i32>) -> i32 | |
| %41 = "llvm.zext"(%40) : (i32) -> i64 | |
| %42 = "llvm.mlir.constant"() {value = 64 : index} : () -> i64 | |
| %43 = "llvm.mul"(%37, %42) : (i64, i64) -> i64 | |
| %44 = "llvm.mul"(%41, %42) : (i64, i64) -> i64 | |
| "llvm.br"(%43)[^bb1] : (i64) -> () | |
| ^bb1(%45: i64): // 2 preds: ^bb0, ^bb28 | |
| %46 = "llvm.icmp"(%45, %3) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| "llvm.cond_br"(%46)[^bb2, ^bb29] {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (i1) -> () | |
| ^bb2: // pred: ^bb1 | |
| %47 = "llvm.mul"(%30, %42) : (i64, i64) -> i64 | |
| %48 = "llvm.mul"(%34, %42) : (i64, i64) -> i64 | |
| "llvm.br"(%47)[^bb3] : (i64) -> () | |
| ^bb3(%49: i64): // 2 preds: ^bb2, ^bb27 | |
| %50 = "llvm.icmp"(%49, %3) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| "llvm.cond_br"(%50)[^bb4, ^bb28] {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (i1) -> () | |
| ^bb4: // pred: ^bb3 | |
| %51 = "llvm.mlir.constant"() {value = -1 : index} : () -> i64 | |
| %52 = "llvm.mul"(%45, %51) : (i64, i64) -> i64 | |
| %53 = "llvm.add"(%52, %3) : (i64, i64) -> i64 | |
| %54 = "llvm.icmp"(%42, %53) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| %55 = "llvm.select"(%54, %42, %53) : (i1, i64, i64) -> i64 | |
| %56 = "llvm.bitcast"(%13) : (!llvm.ptr<f32>) -> !llvm.ptr<f32> | |
| %57 = "llvm.mul"(%45, %3) : (i64, i64) -> i64 | |
| %58 = "llvm.add"(%2, %57) : (i64, i64) -> i64 | |
| %59 = "llvm.mul"(%9, %6) : (i64, i64) -> i64 | |
| %60 = "llvm.add"(%58, %59) : (i64, i64) -> i64 | |
| %61 = "llvm.mlir.constant"() {value = 10 : i64} : () -> i64 | |
| %62 = "llvm.mul"(%49, %51) : (i64, i64) -> i64 | |
| %63 = "llvm.add"(%62, %3) : (i64, i64) -> i64 | |
| %64 = "llvm.icmp"(%42, %63) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| %65 = "llvm.select"(%64, %42, %63) : (i1, i64, i64) -> i64 | |
| %66 = "llvm.bitcast"(%20) : (!llvm.ptr<f32>) -> !llvm.ptr<f32> | |
| %67 = "llvm.mul"(%9, %3) : (i64, i64) -> i64 | |
| %68 = "llvm.add"(%2, %67) : (i64, i64) -> i64 | |
| %69 = "llvm.mul"(%49, %6) : (i64, i64) -> i64 | |
| %70 = "llvm.add"(%68, %69) : (i64, i64) -> i64 | |
| %71 = "llvm.icmp"(%53, %42) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| %72 = "llvm.select"(%71, %53, %42) : (i1, i64, i64) -> i64 | |
| %73 = "llvm.icmp"(%63, %42) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| %74 = "llvm.select"(%73, %63, %42) : (i1, i64, i64) -> i64 | |
| %75 = "llvm.bitcast"(%27) : (!llvm.ptr<f32>) -> !llvm.ptr<f32> | |
| %76 = "llvm.add"(%58, %69) : (i64, i64) -> i64 | |
| "llvm.br"(%2)[^bb5] : (i64) -> () | |
| ^bb5(%77: i64): // 2 preds: ^bb4, ^bb8 | |
| %78 = "llvm.icmp"(%77, %72) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| "llvm.cond_br"(%78, %2, %2)[^bb6, ^bb9] {operand_segment_sizes = dense<1> : vector<3xi32>} : (i1, i64, i64) -> () | |
| ^bb6(%79: i64): // 2 preds: ^bb5, ^bb7 | |
| %80 = "llvm.icmp"(%79, %74) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| "llvm.cond_br"(%80)[^bb7, ^bb8] {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (i1) -> () | |
| ^bb7: // pred: ^bb6 | |
| %81 = "llvm.add"(%77, %45) : (i64, i64) -> i64 | |
| %82 = "llvm.add"(%79, %49) : (i64, i64) -> i64 | |
| %83 = "llvm.icmp"(%81, %82) {predicate = 0 : i64} : (i64, i64) -> i1 | |
| %84 = "llvm.select"(%83, %4, %5) : (i1, f32, f32) -> f32 | |
| %85 = "llvm.mul"(%77, %3) : (i64, i64) -> i64 | |
| %86 = "llvm.add"(%76, %85) : (i64, i64) -> i64 | |
| %87 = "llvm.add"(%86, %79) : (i64, i64) -> i64 | |
| %88 = "llvm.getelementptr"(%75, %87) : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| "llvm.store"(%84, %88) : (f32, !llvm.ptr<f32>) -> () | |
| %89 = "llvm.add"(%79, %6) : (i64, i64) -> i64 | |
| "llvm.br"(%89)[^bb6] : (i64) -> () | |
| ^bb8: // pred: ^bb6 | |
| %90 = "llvm.add"(%77, %6) : (i64, i64) -> i64 | |
| "llvm.br"(%90)[^bb5] : (i64) -> () | |
| ^bb9(%91: i64): // 2 preds: ^bb5, ^bb26 | |
| %92 = "llvm.icmp"(%91, %55) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| "llvm.cond_br"(%92, %2)[^bb10, ^bb27] {operand_segment_sizes = dense<[1, 1, 0]> : vector<3xi32>} : (i1, i64) -> () | |
| ^bb10(%93: i64): // 2 preds: ^bb9, ^bb25 | |
| %94 = "llvm.icmp"(%93, %65) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| "llvm.cond_br"(%94)[^bb11, ^bb26] {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (i1) -> () | |
| ^bb11: // pred: ^bb10 | |
| %95 = "llvm.mul"(%91, %51) : (i64, i64) -> i64 | |
| %96 = "llvm.add"(%55, %95) : (i64, i64) -> i64 | |
| %97 = "llvm.icmp"(%1, %96) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| %98 = "llvm.select"(%97, %1, %96) : (i1, i64, i64) -> i64 | |
| %99 = "llvm.bitcast"(%56) : (!llvm.ptr<f32>) -> !llvm.ptr<f32> | |
| %100 = "llvm.mul"(%91, %61) : (i64, i64) -> i64 | |
| %101 = "llvm.add"(%60, %100) : (i64, i64) -> i64 | |
| %102 = "llvm.mul"(%9, %16) : (i64, i64) -> i64 | |
| %103 = "llvm.add"(%101, %102) : (i64, i64) -> i64 | |
| %104 = "llvm.mul"(%93, %51) : (i64, i64) -> i64 | |
| %105 = "llvm.add"(%65, %104) : (i64, i64) -> i64 | |
| %106 = "llvm.icmp"(%1, %105) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| %107 = "llvm.select"(%106, %1, %105) : (i1, i64, i64) -> i64 | |
| %108 = "llvm.bitcast"(%66) : (!llvm.ptr<f32>) -> !llvm.ptr<f32> | |
| %109 = "llvm.mul"(%9, %61) : (i64, i64) -> i64 | |
| %110 = "llvm.add"(%70, %109) : (i64, i64) -> i64 | |
| %111 = "llvm.mul"(%93, %16) : (i64, i64) -> i64 | |
| %112 = "llvm.add"(%110, %111) : (i64, i64) -> i64 | |
| %113 = "llvm.bitcast"(%75) : (!llvm.ptr<f32>) -> !llvm.ptr<f32> | |
| %114 = "llvm.add"(%76, %100) : (i64, i64) -> i64 | |
| %115 = "llvm.add"(%114, %111) : (i64, i64) -> i64 | |
| "llvm.br"(%2)[^bb12] : (i64) -> () | |
| ^bb12(%116: i64): // 2 preds: ^bb11, ^bb24 | |
| %117 = "llvm.icmp"(%116, %98) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| "llvm.cond_br"(%117, %2)[^bb13, ^bb25] {operand_segment_sizes = dense<[1, 1, 0]> : vector<3xi32>} : (i1, i64) -> () | |
| ^bb13(%118: i64): // 2 preds: ^bb12, ^bb23 | |
| %119 = "llvm.icmp"(%118, %107) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| "llvm.cond_br"(%119, %2)[^bb14, ^bb24] {operand_segment_sizes = dense<[1, 1, 0]> : vector<3xi32>} : (i1, i64) -> () | |
| ^bb14(%120: i64): // 2 preds: ^bb13, ^bb22 | |
| %121 = "llvm.icmp"(%120, %3) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| "llvm.cond_br"(%121)[^bb15, ^bb23] {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (i1) -> () | |
| ^bb15: // pred: ^bb14 | |
| %122 = "llvm.mul"(%116, %51) : (i64, i64) -> i64 | |
| %123 = "llvm.add"(%98, %122) : (i64, i64) -> i64 | |
| %124 = "llvm.icmp"(%0, %123) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| %125 = "llvm.select"(%124, %0, %123) : (i1, i64, i64) -> i64 | |
| %126 = "llvm.mul"(%120, %51) : (i64, i64) -> i64 | |
| %127 = "llvm.add"(%126, %3) : (i64, i64) -> i64 | |
| %128 = "llvm.icmp"(%0, %127) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| %129 = "llvm.select"(%128, %0, %127) : (i1, i64, i64) -> i64 | |
| %130 = "llvm.bitcast"(%99) : (!llvm.ptr<f32>) -> !llvm.ptr<f32> | |
| %131 = "llvm.mul"(%116, %61) : (i64, i64) -> i64 | |
| %132 = "llvm.add"(%103, %131) : (i64, i64) -> i64 | |
| %133 = "llvm.mul"(%120, %16) : (i64, i64) -> i64 | |
| %134 = "llvm.add"(%132, %133) : (i64, i64) -> i64 | |
| %135 = "llvm.mul"(%118, %51) : (i64, i64) -> i64 | |
| %136 = "llvm.add"(%107, %135) : (i64, i64) -> i64 | |
| %137 = "llvm.icmp"(%0, %136) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| %138 = "llvm.select"(%137, %0, %136) : (i1, i64, i64) -> i64 | |
| %139 = "llvm.bitcast"(%108) : (!llvm.ptr<f32>) -> !llvm.ptr<f32> | |
| %140 = "llvm.mul"(%120, %61) : (i64, i64) -> i64 | |
| %141 = "llvm.add"(%112, %140) : (i64, i64) -> i64 | |
| %142 = "llvm.mul"(%118, %16) : (i64, i64) -> i64 | |
| %143 = "llvm.add"(%141, %142) : (i64, i64) -> i64 | |
| %144 = "llvm.bitcast"(%113) : (!llvm.ptr<f32>) -> !llvm.ptr<f32> | |
| %145 = "llvm.add"(%115, %131) : (i64, i64) -> i64 | |
| %146 = "llvm.add"(%145, %142) : (i64, i64) -> i64 | |
| "llvm.br"(%2)[^bb16] : (i64) -> () | |
| ^bb16(%147: i64): // 2 preds: ^bb15, ^bb21 | |
| %148 = "llvm.icmp"(%147, %125) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| "llvm.cond_br"(%148, %2)[^bb17, ^bb22] {operand_segment_sizes = dense<[1, 1, 0]> : vector<3xi32>} : (i1, i64) -> () | |
| ^bb17(%149: i64): // 2 preds: ^bb16, ^bb20 | |
| %150 = "llvm.icmp"(%149, %138) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| "llvm.cond_br"(%150, %2)[^bb18, ^bb21] {operand_segment_sizes = dense<[1, 1, 0]> : vector<3xi32>} : (i1, i64) -> () | |
| ^bb18(%151: i64): // 2 preds: ^bb17, ^bb19 | |
| %152 = "llvm.icmp"(%151, %129) {predicate = 2 : i64} : (i64, i64) -> i1 | |
| "llvm.cond_br"(%152)[^bb19, ^bb20] {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (i1) -> () | |
| ^bb19: // pred: ^bb18 | |
| %153 = "llvm.mul"(%147, %3) : (i64, i64) -> i64 | |
| %154 = "llvm.add"(%134, %153) : (i64, i64) -> i64 | |
| %155 = "llvm.add"(%154, %151) : (i64, i64) -> i64 | |
| %156 = "llvm.getelementptr"(%130, %155) : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| %157 = "llvm.load"(%156) : (!llvm.ptr<f32>) -> f32 | |
| %158 = "llvm.mul"(%151, %3) : (i64, i64) -> i64 | |
| %159 = "llvm.add"(%143, %158) : (i64, i64) -> i64 | |
| %160 = "llvm.add"(%159, %149) : (i64, i64) -> i64 | |
| %161 = "llvm.getelementptr"(%139, %160) : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| %162 = "llvm.load"(%161) : (!llvm.ptr<f32>) -> f32 | |
| %163 = "llvm.add"(%146, %153) : (i64, i64) -> i64 | |
| %164 = "llvm.add"(%163, %149) : (i64, i64) -> i64 | |
| %165 = "llvm.getelementptr"(%144, %164) : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
| %166 = "llvm.load"(%165) : (!llvm.ptr<f32>) -> f32 | |
| %167 = "llvm.fmul"(%157, %162) : (f32, f32) -> f32 | |
| %168 = "llvm.fadd"(%166, %167) : (f32, f32) -> f32 | |
| "llvm.store"(%168, %165) : (f32, !llvm.ptr<f32>) -> () | |
| %169 = "llvm.add"(%151, %6) : (i64, i64) -> i64 | |
| "llvm.br"(%169)[^bb18] : (i64) -> () | |
| ^bb20: // pred: ^bb18 | |
| %170 = "llvm.add"(%149, %6) : (i64, i64) -> i64 | |
| "llvm.br"(%170)[^bb17] : (i64) -> () | |
| ^bb21: // pred: ^bb17 | |
| %171 = "llvm.add"(%147, %6) : (i64, i64) -> i64 | |
| "llvm.br"(%171)[^bb16] : (i64) -> () | |
| ^bb22: // pred: ^bb16 | |
| %172 = "llvm.add"(%120, %0) : (i64, i64) -> i64 | |
| "llvm.br"(%172)[^bb14] : (i64) -> () | |
| ^bb23: // pred: ^bb14 | |
| %173 = "llvm.add"(%118, %0) : (i64, i64) -> i64 | |
| "llvm.br"(%173)[^bb13] : (i64) -> () | |
| ^bb24: // pred: ^bb13 | |
| %174 = "llvm.add"(%116, %0) : (i64, i64) -> i64 | |
| "llvm.br"(%174)[^bb12] : (i64) -> () | |
| ^bb25: // pred: ^bb12 | |
| %175 = "llvm.add"(%93, %1) : (i64, i64) -> i64 | |
| "llvm.br"(%175)[^bb10] : (i64) -> () | |
| ^bb26: // pred: ^bb10 | |
| %176 = "llvm.add"(%91, %1) : (i64, i64) -> i64 | |
| "llvm.br"(%176)[^bb9] : (i64) -> () | |
| ^bb27: // pred: ^bb9 | |
| %177 = "llvm.add"(%49, %48) : (i64, i64) -> i64 | |
| "llvm.br"(%177)[^bb3] : (i64) -> () | |
| ^bb28: // pred: ^bb3 | |
| %178 = "llvm.add"(%45, %44) : (i64, i64) -> i64 | |
| "llvm.br"(%178)[^bb1] : (i64) -> () | |
| ^bb29: // pred: ^bb1 | |
| %179 = "llvm.mlir.constant"() {value = 0 : i32} : () -> i32 | |
| "llvm.return"(%179) : (i32) -> () | |
| }) {linkage = #llvm.linkage<internal>, sym_name = "matmul_test_dispatch_2", sym_visibility = "private", type = !llvm.func<i32 (ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, ptr<array<3 x i32>>, ptr<i8>)>} : () -> () | |
| }) {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "system_elf_x86_64", target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>} : () -> () | |
| "hal.executable_end"() : () -> () | |
| }) {sym_name = "matmul_test_dispatch_2", sym_visibility = "private"} : () -> () | |
| "hal.executable"() ( { | |
| "hal.interface"() ( { | |
| "hal.interface.binding"() {access = 1 : i32, binding = 0 : index, set = 0 : index, sym_name = "s0b0_ro_external", type = 7 : i32} : () -> () | |
| "hal.interface.binding"() {access = 1 : i32, binding = 1 : index, set = 0 : index, sym_name = "s0b1_ro_external", type = 7 : i32} : () -> () | |
| "hal.interface.binding"() {access = 6 : i32, binding = 2 : index, set = 0 : index, sym_name = "s0b2_xw_external", type = 7 : i32} : () -> () | |
| "hal.interface_end"() : () -> () | |
| }) {sym_name = "io"} : () -> () | |
| "hal.executable.variant"() ( { | |
| "hal.executable.entry_point"() {interface = @io, ordinal = 0 : index, sym_name = "matmul_test_dispatch_3", translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} : () -> () | |
| "builtin.module"() ( { | |
| "builtin.func"() ( { | |
| %0 = "std.constant"() {value = 1.000000e+00 : f32} : () -> f32 | |
| %1 = "std.constant"() {value = 0.000000e+00 : f32} : () -> f32 | |
| %2 = "std.constant"() {value = 10 : index} : () -> index | |
| %3 = "std.constant"() {value = 0 : index} : () -> index | |
| %4 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b0_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32> | |
| %5 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b1_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32> | |
| %6 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b2_xw_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<writeonly:10x10xf32> | |
| %7 = "hal.interface.workgroup.size"() {dimension = 0 : index} : () -> index | |
| %8 = "hal.interface.workgroup.size"() {dimension = 1 : index} : () -> index | |
| %9 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
| %10 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index | |
| %11 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
| %12 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index | |
| %13 = "affine.apply"(%11, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| %14 = "affine.apply"(%12, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| "scf.for"(%13, %2, %14) ( { | |
| ^bb0(%arg0: index): // no predecessors | |
| %15 = "affine.apply"(%9, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| %16 = "affine.apply"(%10, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index | |
| "scf.for"(%15, %2, %16) ( { | |
| ^bb0(%arg1: index): // no predecessors | |
| %17 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %18 = "flow.dispatch.tensor.load"(%4, %arg0, %17) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [-9223372036854775808, 0], static_sizes = [-1, 10], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<?x10xf32> | |
| %19 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %20 = "flow.dispatch.tensor.load"(%5, %arg1, %19) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [0, -9223372036854775808], static_sizes = [10, -1], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<10x?xf32> | |
| %21 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %22 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index | |
| %23 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %24 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %25 = "linalg.init_tensor"(%23, %24) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32> | |
| %26 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %27 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index | |
| %28 = "linalg.init_tensor"(%26, %27) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32> | |
| %29 = "linalg.generic"(%25, %28) ( { | |
| ^bb0(%arg2: f32, %arg3: f32): // no predecessors | |
| %31 = "linalg.index"() {dim = 0 : i64} : () -> index | |
| %32 = "affine.apply"(%31, %arg0) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index | |
| %33 = "linalg.index"() {dim = 1 : i64} : () -> index | |
| %34 = "affine.apply"(%33, %arg1) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index | |
| %35 = "std.cmpi"(%32, %34) {predicate = 0 : i64} : (index, index) -> i1 | |
| %36 = "std.select"(%35, %1, %0) : (i1, f32, f32) -> f32 | |
| "linalg.yield"(%36) : (f32) -> () | |
| }) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32> | |
| %30 = "linalg.generic"(%18, %20, %29) ( { | |
| ^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors | |
| %31 = "std.mulf"(%arg2, %arg3) : (f32, f32) -> f32 | |
| %32 = "std.addf"(%31, %arg4) : (f32, f32) -> f32 | |
| "linalg.yield"(%32) : (f32) -> () | |
| }) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x10xf32>, tensor<10x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32> | |
| "flow.dispatch.tensor.store"(%30, %6, %arg0, %arg1, %21, %22) {operand_segment_sizes = dense<[1, 1, 2, 2, 0]> : vector<5xi32>, static_offsets = [-9223372036854775808, -9223372036854775808], static_sizes = [-1, -1], static_strides = [1, 1]} : (tensor<?x?xf32>, !flow.dispatch.tensor<writeonly:10x10xf32>, index, index, index, index) -> () | |
| "scf.yield"() : () -> () | |
| }) : (index, index, index) -> () | |
| "scf.yield"() : () -> () | |
| }) : (index, index, index) -> () | |
| "std.return"() : () -> () | |
| }) {sym_name = "matmul_test_dispatch_3", type = () -> ()} : () -> () | |
| "hal.interface"() ( { | |
| "hal.interface.binding"() {access = 1 : i32, binding = 0 : index, set = 0 : index, sym_name = "s0b0_ro_external", type = 7 : i32} : () -> () | |
| "hal.interface.binding"() {access = 1 : i32, binding = 1 : index, set = 0 : index, sym_name = "s0b1_ro_external", type = 7 : i32} : () -> () | |
| "hal.interface.binding"() {access = 6 : i32, binding = 2 : index, set = 0 : index, sym_name = "s0b2_xw_external", type = 7 : i32} : () -> () | |
| "hal.interface_end"() : () -> () | |
| }) {sym_name = "io", sym_visibility = "private"} : () -> () | |
| }) : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "system_elf_x86_64", target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>} : () -> () | |
| "hal.executable_end"() : () -> () | |
| }) {sym_name = "matmul_test_dispatch_3", sym_visibility = "private"} : () -> () | |
| "builtin.func"() ( { | |
| %0 = "std.constant"() {value = 10 : index} : () -> index | |
| %1:2 = "flow.ex.stream.fragment"(%0, %0, %0, %0) ( { | |
| %2 = "std.constant"() {value = 1 : index} : () -> index | |
| %3 = "std.constant"() {value = 10 : index} : () -> index | |
| %4 = "flow.dispatch"(%3, %3, %2) {entry_point = @matmul_test_dispatch_0::@matmul_test_dispatch_0, hal.bindings = [#hal.ex.result_buffer<"s0b0_xw_external", 0 : index>], operand_segment_sizes = dense<[3, 0, 0, 0]> : vector<4xi32>, tied_operands = []} : (index, index, index) -> tensor<10x10xf32> | |
| %5 = "flow.dispatch"(%3, %3, %2, %4, %4) {entry_point = @matmul_test_dispatch_2::@matmul_test_dispatch_2, hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>], operand_segment_sizes = dense<[3, 2, 0, 0]> : vector<4xi32>, tied_operands = []} : (index, index, index, tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %6 = "flow.tensor.reshape"(%5, %3, %3) {operand_segment_sizes = dense<[1, 0, 2]> : vector<3xi32>} : (tensor<10x10xf32>, index, index) -> tensor<?x?xf32> | |
| %7 = "flow.dispatch"(%3, %3, %2, %4, %4) {entry_point = @matmul_test_dispatch_3::@matmul_test_dispatch_3, hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>], operand_segment_sizes = dense<[3, 2, 0, 0]> : vector<4xi32>, tied_operands = []} : (index, index, index, tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> | |
| %8 = "flow.tensor.reshape"(%7, %3, %3) {operand_segment_sizes = dense<[1, 0, 2]> : vector<3xi32>} : (tensor<10x10xf32>, index, index) -> tensor<?x?xf32> | |
| "flow.return"(%6, %8) : (tensor<?x?xf32>, tensor<?x?xf32>) -> () | |
| }) {operand_segment_sizes = dense<[0, 0, 4]> : vector<3xi32>, tied_operands = []} : (index, index, index, index) -> (tensor<?x?xf32>, tensor<?x?xf32>) | |
| "check.expect_eq"(%1#0, %1#1) : (tensor<?x?xf32>, tensor<?x?xf32>) -> () | |
| "std.return"() : () -> () | |
| }) {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}, sym_name = "matmul_test", type = () -> ()} : () -> () | |
| }) {hal.device.targets = [#hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>]}>]} : () -> () | 
  