bjacob · March 5, 2021 19:02
diff --git a/gistfile1.txt b/gistfile1.txt
 ```
 benoitjacob@benoitjacob:/google/src/cloud/benoitjacob/fig1/google3$ blaze-bin/third_party/iree/experimental/runners/mlir-proto-opt -linalg-comprehensive-bufferize-inplace -print-ir-after-all -mlir-disable-threading /tmp/a.mlir
 // *** IR Dump After Canonicalizer ***
 func @generate_pseudorandom_4d_f32(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> tensor<?x?x?x?xf32> {
  %0 = tensor.generate %arg0, %arg1, %arg2, %arg3  {
  ^bb0(%arg4: index, %arg5: index, %arg6: index, %arg7: index):  // no predecessors
    %1 = index_cast %arg4 : index to i32
    %2 = sitofp %1 : i32 to f32
    tensor.yield %2 : f32
  } : tensor<?x?x?x?xf32>
  return %0 : tensor<?x?x?x?xf32>
 }

 // *** IR Dump After CSE ***
 func @generate_pseudorandom_4d_f32(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> tensor<?x?x?x?xf32> {
  %0 = tensor.generate %arg0, %arg1, %arg2, %arg3  {
  ^bb0(%arg4: index, %arg5: index, %arg6: index, %arg7: index):  // no predecessors
    %1 = index_cast %arg4 : index to i32
    %2 = sitofp %1 : i32 to f32
    tensor.yield %2 : f32
  } : tensor<?x?x?x?xf32>
  return %0 : tensor<?x?x?x?xf32>
 }

 // *** IR Dump After LoopInvariantCodeMotion ***
 func @generate_pseudorandom_4d_f32(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> tensor<?x?x?x?xf32> {
  %0 = tensor.generate %arg0, %arg1, %arg2, %arg3  {
  ^bb0(%arg4: index, %arg5: index, %arg6: index, %arg7: index):  // no predecessors
    %1 = index_cast %arg4 : index to i32
    %2 = sitofp %1 : i32 to f32
    tensor.yield %2 : f32
  } : tensor<?x?x?x?xf32>
  return %0 : tensor<?x?x?x?xf32>
 }

 // *** IR Dump After Canonicalizer ***
 func @generate_pseudorandom_4d_f32(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> tensor<?x?x?x?xf32> {
  %0 = tensor.generate %arg0, %arg1, %arg2, %arg3  {
  ^bb0(%arg4: index, %arg5: index, %arg6: index, %arg7: index):  // no predecessors
    %1 = index_cast %arg4 : index to i32
    %2 = sitofp %1 : i32 to f32
    tensor.yield %2 : f32
  } : tensor<?x?x?x?xf32>
  return %0 : tensor<?x?x?x?xf32>
 }

 // *** IR Dump After CSE ***
 func @generate_pseudorandom_4d_f32(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> tensor<?x?x?x?xf32> {
  %0 = tensor.generate %arg0, %arg1, %arg2, %arg3  {
  ^bb0(%arg4: index, %arg5: index, %arg6: index, %arg7: index):  // no predecessors
    %1 = index_cast %arg4 : index to i32
    %2 = sitofp %1 : i32 to f32
    tensor.yield %2 : f32
  } : tensor<?x?x?x?xf32>
  return %0 : tensor<?x?x?x?xf32>
 }

 // *** IR Dump After LoopInvariantCodeMotion ***
 func @generate_pseudorandom_4d_f32(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> tensor<?x?x?x?xf32> {
  %0 = tensor.generate %arg0, %arg1, %arg2, %arg3  {
  ^bb0(%arg4: index, %arg5: index, %arg6: index, %arg7: index):  // no predecessors
    %1 = index_cast %arg4 : index to i32
    %2 = sitofp %1 : i32 to f32
    tensor.yield %2 : f32
  } : tensor<?x?x?x?xf32>
  return %0 : tensor<?x?x?x?xf32>
 }

 // *** IR Dump After Canonicalizer ***
 func @main() {
  %c8 = constant 8 : index
  %c4 = constant 4 : index
  %c2 = constant 2 : index
  %c0 = constant 0 : index
  %cst = constant 0.000000e+00 : f32
  %0 = call @generate_pseudorandom_4d_f32(%c2, %c8, %c4, %c2) : (index, index, index, index) -> tensor<?x?x?x?xf32>
  %1 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %5 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %6 = index_cast %5 : index to i32
    %7 = sitofp %6 : i32 to f32
    tensor.yield %7 : f32
  } : tensor<3x4x4x2xf32>
  %2 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %5 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %6 = index_cast %5 : index to i32
    %7 = sitofp %6 : i32 to f32
    tensor.yield %7 : f32
  } : tensor<2x3x8x4xf32>
  %3 = linalg.mmt_4d_kernel ins(%0, %1 : tensor<?x?x?x?xf32>, tensor<3x4x4x2xf32>) outs(%2 : tensor<2x3x8x4xf32>) -> tensor<2x3x8x4xf32>
  %4 = vector.transfer_read %3[%c0, %c0, %c0, %c0], %cst {masked = [false, false, false, false]} : tensor<2x3x8x4xf32>, vector<2x3x8x4xf32>
  vector.print %4 : vector<2x3x8x4xf32>
  return
 }

 // *** IR Dump After CSE ***
 func @main() {
  %c8 = constant 8 : index
  %c4 = constant 4 : index
  %c2 = constant 2 : index
  %c0 = constant 0 : index
  %cst = constant 0.000000e+00 : f32
  %0 = call @generate_pseudorandom_4d_f32(%c2, %c8, %c4, %c2) : (index, index, index, index) -> tensor<?x?x?x?xf32>
  %1 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %5 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %6 = index_cast %5 : index to i32
    %7 = sitofp %6 : i32 to f32
    tensor.yield %7 : f32
  } : tensor<3x4x4x2xf32>
  %2 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %5 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %6 = index_cast %5 : index to i32
    %7 = sitofp %6 : i32 to f32
    tensor.yield %7 : f32
  } : tensor<2x3x8x4xf32>
  %3 = linalg.mmt_4d_kernel ins(%0, %1 : tensor<?x?x?x?xf32>, tensor<3x4x4x2xf32>) outs(%2 : tensor<2x3x8x4xf32>) -> tensor<2x3x8x4xf32>
  %4 = vector.transfer_read %3[%c0, %c0, %c0, %c0], %cst {masked = [false, false, false, false]} : tensor<2x3x8x4xf32>, vector<2x3x8x4xf32>
  vector.print %4 : vector<2x3x8x4xf32>
  return
 }

 // *** IR Dump After LoopInvariantCodeMotion ***
 func @main() {
  %c8 = constant 8 : index
  %c4 = constant 4 : index
  %c2 = constant 2 : index
  %c0 = constant 0 : index
  %cst = constant 0.000000e+00 : f32
  %0 = call @generate_pseudorandom_4d_f32(%c2, %c8, %c4, %c2) : (index, index, index, index) -> tensor<?x?x?x?xf32>
  %1 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %5 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %6 = index_cast %5 : index to i32
    %7 = sitofp %6 : i32 to f32
    tensor.yield %7 : f32
  } : tensor<3x4x4x2xf32>
  %2 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %5 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %6 = index_cast %5 : index to i32
    %7 = sitofp %6 : i32 to f32
    tensor.yield %7 : f32
  } : tensor<2x3x8x4xf32>
  %3 = linalg.mmt_4d_kernel ins(%0, %1 : tensor<?x?x?x?xf32>, tensor<3x4x4x2xf32>) outs(%2 : tensor<2x3x8x4xf32>) -> tensor<2x3x8x4xf32>
  %4 = vector.transfer_read %3[%c0, %c0, %c0, %c0], %cst {masked = [false, false, false, false]} : tensor<2x3x8x4xf32>, vector<2x3x8x4xf32>
  vector.print %4 : vector<2x3x8x4xf32>
  return
 }

 // *** IR Dump After Canonicalizer ***
 func @main() {
  %c8 = constant 8 : index
  %c4 = constant 4 : index
  %c2 = constant 2 : index
  %c0 = constant 0 : index
  %cst = constant 0.000000e+00 : f32
  %0 = call @generate_pseudorandom_4d_f32(%c2, %c8, %c4, %c2) : (index, index, index, index) -> tensor<?x?x?x?xf32>
  %1 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %5 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %6 = index_cast %5 : index to i32
    %7 = sitofp %6 : i32 to f32
    tensor.yield %7 : f32
  } : tensor<3x4x4x2xf32>
  %2 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %5 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %6 = index_cast %5 : index to i32
    %7 = sitofp %6 : i32 to f32
    tensor.yield %7 : f32
  } : tensor<2x3x8x4xf32>
  %3 = linalg.mmt_4d_kernel ins(%0, %1 : tensor<?x?x?x?xf32>, tensor<3x4x4x2xf32>) outs(%2 : tensor<2x3x8x4xf32>) -> tensor<2x3x8x4xf32>
  %4 = vector.transfer_read %3[%c0, %c0, %c0, %c0], %cst {masked = [false, false, false, false]} : tensor<2x3x8x4xf32>, vector<2x3x8x4xf32>
  vector.print %4 : vector<2x3x8x4xf32>
  return
 }

 // *** IR Dump After CSE ***
 func @main() {
  %c8 = constant 8 : index
  %c4 = constant 4 : index
  %c2 = constant 2 : index
  %c0 = constant 0 : index
  %cst = constant 0.000000e+00 : f32
  %0 = call @generate_pseudorandom_4d_f32(%c2, %c8, %c4, %c2) : (index, index, index, index) -> tensor<?x?x?x?xf32>
  %1 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %5 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %6 = index_cast %5 : index to i32
    %7 = sitofp %6 : i32 to f32
    tensor.yield %7 : f32
  } : tensor<3x4x4x2xf32>
  %2 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %5 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %6 = index_cast %5 : index to i32
    %7 = sitofp %6 : i32 to f32
    tensor.yield %7 : f32
  } : tensor<2x3x8x4xf32>
  %3 = linalg.mmt_4d_kernel ins(%0, %1 : tensor<?x?x?x?xf32>, tensor<3x4x4x2xf32>) outs(%2 : tensor<2x3x8x4xf32>) -> tensor<2x3x8x4xf32>
  %4 = vector.transfer_read %3[%c0, %c0, %c0, %c0], %cst {masked = [false, false, false, false]} : tensor<2x3x8x4xf32>, vector<2x3x8x4xf32>
  vector.print %4 : vector<2x3x8x4xf32>
  return
 }

 // *** IR Dump After LoopInvariantCodeMotion ***
 func @main() {
  %c8 = constant 8 : index
  %c4 = constant 4 : index
  %c2 = constant 2 : index
  %c0 = constant 0 : index
  %cst = constant 0.000000e+00 : f32
  %0 = call @generate_pseudorandom_4d_f32(%c2, %c8, %c4, %c2) : (index, index, index, index) -> tensor<?x?x?x?xf32>
  %1 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %5 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %6 = index_cast %5 : index to i32
    %7 = sitofp %6 : i32 to f32
    tensor.yield %7 : f32
  } : tensor<3x4x4x2xf32>
  %2 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %5 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %6 = index_cast %5 : index to i32
    %7 = sitofp %6 : i32 to f32
    tensor.yield %7 : f32
  } : tensor<2x3x8x4xf32>
  %3 = linalg.mmt_4d_kernel ins(%0, %1 : tensor<?x?x?x?xf32>, tensor<3x4x4x2xf32>) outs(%2 : tensor<2x3x8x4xf32>) -> tensor<2x3x8x4xf32>
  %4 = vector.transfer_read %3[%c0, %c0, %c0, %c0], %cst {masked = [false, false, false, false]} : tensor<2x3x8x4xf32>, vector<2x3x8x4xf32>
  vector.print %4 : vector<2x3x8x4xf32>
  return
 }

 // *** IR Dump After Canonicalizer ***
 func @main() {
  %c8 = constant 8 : index
  %c4 = constant 4 : index
  %c2 = constant 2 : index
  %c0 = constant 0 : index
  %cst = constant 0.000000e+00 : f32
  %0 = call @generate_pseudorandom_4d_f32(%c2, %c8, %c4, %c2) : (index, index, index, index) -> tensor<?x?x?x?xf32>
  %1 = tensor_to_memref %0 : memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
  %2 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %7 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %8 = index_cast %7 : index to i32
    %9 = sitofp %8 : i32 to f32
    tensor.yield %9 : f32
  } : tensor<3x4x4x2xf32>
  %3 = tensor_to_memref %2 : memref<3x4x4x2xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
  %4 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %7 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %8 = index_cast %7 : index to i32
    %9 = sitofp %8 : i32 to f32
    tensor.yield %9 : f32
  } : tensor<2x3x8x4xf32>
  %5 = tensor_to_memref %4 : memref<2x3x8x4xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
  linalg.mmt_4d_kernel ins(%1, %3 : memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>, memref<3x4x4x2xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>) outs(%5 : memref<2x3x8x4xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>)
  %6 = vector.transfer_read %5[%c0, %c0, %c0, %c0], %cst {masked = [false, false, false, false]} : memref<2x3x8x4xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>, vector<2x3x8x4xf32>
  vector.print %6 : vector<2x3x8x4xf32>
  return
 }

 // *** IR Dump After CSE ***
 func @main() {
  %c8 = constant 8 : index
  %c4 = constant 4 : index
  %c2 = constant 2 : index
  %c0 = constant 0 : index
  %cst = constant 0.000000e+00 : f32
  %0 = call @generate_pseudorandom_4d_f32(%c2, %c8, %c4, %c2) : (index, index, index, index) -> tensor<?x?x?x?xf32>
  %1 = tensor_to_memref %0 : memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
  %2 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %7 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %8 = index_cast %7 : index to i32
    %9 = sitofp %8 : i32 to f32
    tensor.yield %9 : f32
  } : tensor<3x4x4x2xf32>
  %3 = tensor_to_memref %2 : memref<3x4x4x2xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
  %4 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %7 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %8 = index_cast %7 : index to i32
    %9 = sitofp %8 : i32 to f32
    tensor.yield %9 : f32
  } : tensor<2x3x8x4xf32>
  %5 = tensor_to_memref %4 : memref<2x3x8x4xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
  linalg.mmt_4d_kernel ins(%1, %3 : memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>, memref<3x4x4x2xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>) outs(%5 : memref<2x3x8x4xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>)
  %6 = vector.transfer_read %5[%c0, %c0, %c0, %c0], %cst {masked = [false, false, false, false]} : memref<2x3x8x4xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>, vector<2x3x8x4xf32>
  vector.print %6 : vector<2x3x8x4xf32>
  return
 }

 // *** IR Dump After LoopInvariantCodeMotion ***
 func @main() {
  %c8 = constant 8 : index
  %c4 = constant 4 : index
  %c2 = constant 2 : index
  %c0 = constant 0 : index
  %cst = constant 0.000000e+00 : f32
  %0 = call @generate_pseudorandom_4d_f32(%c2, %c8, %c4, %c2) : (index, index, index, index) -> tensor<?x?x?x?xf32>
  %1 = tensor_to_memref %0 : memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
  %2 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %7 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %8 = index_cast %7 : index to i32
    %9 = sitofp %8 : i32 to f32
    tensor.yield %9 : f32
  } : tensor<3x4x4x2xf32>
  %3 = tensor_to_memref %2 : memref<3x4x4x2xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
  %4 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %7 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %8 = index_cast %7 : index to i32
    %9 = sitofp %8 : i32 to f32
    tensor.yield %9 : f32
  } : tensor<2x3x8x4xf32>
  %5 = tensor_to_memref %4 : memref<2x3x8x4xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
  linalg.mmt_4d_kernel ins(%1, %3 : memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>, memref<3x4x4x2xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>) outs(%5 : memref<2x3x8x4xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>)
  %6 = vector.transfer_read %5[%c0, %c0, %c0, %c0], %cst {masked = [false, false, false, false]} : memref<2x3x8x4xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>, vector<2x3x8x4xf32>
  vector.print %6 : vector<2x3x8x4xf32>
  return
 }

 // *** IR Dump After Canonicalizer ***
 func @main() {
  %c8 = constant 8 : index
  %c4 = constant 4 : index
  %c2 = constant 2 : index
  %c0 = constant 0 : index
  %cst = constant 0.000000e+00 : f32
  %0 = call @generate_pseudorandom_4d_f32(%c2, %c8, %c4, %c2) : (index, index, index, index) -> tensor<?x?x?x?xf32>
  %1 = tensor_to_memref %0 : memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
  %2 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %7 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %8 = index_cast %7 : index to i32
    %9 = sitofp %8 : i32 to f32
    tensor.yield %9 : f32
  } : tensor<3x4x4x2xf32>
  %3 = tensor_to_memref %2 : memref<3x4x4x2xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
  %4 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %7 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %8 = index_cast %7 : index to i32
    %9 = sitofp %8 : i32 to f32
    tensor.yield %9 : f32
  } : tensor<2x3x8x4xf32>
  %5 = tensor_to_memref %4 : memref<2x3x8x4xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
  linalg.mmt_4d_kernel ins(%1, %3 : memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>, memref<3x4x4x2xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>) outs(%5 : memref<2x3x8x4xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>)
  %6 = vector.transfer_read %5[%c0, %c0, %c0, %c0], %cst {masked = [false, false, false, false]} : memref<2x3x8x4xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>, vector<2x3x8x4xf32>
  vector.print %6 : vector<2x3x8x4xf32>
  return
 }

 // *** IR Dump After CSE ***
 func @main() {
  %c8 = constant 8 : index
  %c4 = constant 4 : index
  %c2 = constant 2 : index
  %c0 = constant 0 : index
  %cst = constant 0.000000e+00 : f32
  %0 = call @generate_pseudorandom_4d_f32(%c2, %c8, %c4, %c2) : (index, index, index, index) -> tensor<?x?x?x?xf32>
  %1 = tensor_to_memref %0 : memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
  %2 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %7 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %8 = index_cast %7 : index to i32
    %9 = sitofp %8 : i32 to f32
    tensor.yield %9 : f32
  } : tensor<3x4x4x2xf32>
  %3 = tensor_to_memref %2 : memref<3x4x4x2xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
  %4 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %7 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %8 = index_cast %7 : index to i32
    %9 = sitofp %8 : i32 to f32
    tensor.yield %9 : f32
  } : tensor<2x3x8x4xf32>
  %5 = tensor_to_memref %4 : memref<2x3x8x4xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
  linalg.mmt_4d_kernel ins(%1, %3 : memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>, memref<3x4x4x2xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>) outs(%5 : memref<2x3x8x4xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>)
  %6 = vector.transfer_read %5[%c0, %c0, %c0, %c0], %cst {masked = [false, false, false, false]} : memref<2x3x8x4xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>, vector<2x3x8x4xf32>
  vector.print %6 : vector<2x3x8x4xf32>
  return
 }

 // *** IR Dump After LoopInvariantCodeMotion ***
 func @main() {
  %c8 = constant 8 : index
  %c4 = constant 4 : index
  %c2 = constant 2 : index
  %c0 = constant 0 : index
  %cst = constant 0.000000e+00 : f32
  %0 = call @generate_pseudorandom_4d_f32(%c2, %c8, %c4, %c2) : (index, index, index, index) -> tensor<?x?x?x?xf32>
  %1 = tensor_to_memref %0 : memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
  %2 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %7 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %8 = index_cast %7 : index to i32
    %9 = sitofp %8 : i32 to f32
    tensor.yield %9 : f32
  } : tensor<3x4x4x2xf32>
  %3 = tensor_to_memref %2 : memref<3x4x4x2xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
  %4 = tensor.generate   {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %7 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>(%arg0, %arg1, %arg2, %arg3)
    %8 = index_cast %7 : index to i32
    %9 = sitofp %8 : i32 to f32
    tensor.yield %9 : f32
  } : tensor<2x3x8x4xf32>
  %5 = tensor_to_memref %4 : memref<2x3x8x4xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
  linalg.mmt_4d_kernel ins(%1, %3 : memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>, memref<3x4x4x2xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>) outs(%5 : memref<2x3x8x4xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>)
  %6 = vector.transfer_read %5[%c0, %c0, %c0, %c0], %cst {masked = [false, false, false, false]} : memref<2x3x8x4xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>, vector<2x3x8x4xf32>
  vector.print %6 : vector<2x3x8x4xf32>
  return
 }

 return val does not fold: %0 = tensor.generate %arg0, %arg1, %arg2, %arg3  {
 ^bb0(%arg4: index, %arg5: index, %arg6: index, %arg7: index):  // no predecessors
  %1 = index_cast %arg4 : index to i32
  %2 = sitofp %1 : i32 to f32
  tensor.yield %2 : f32
 } : tensor<?x?x?x?xf32>
 /tmp/a.mlir:26:10: error: 'std.tensor_load' op requires the same shape for all operands and results
  %lhs = call @generate_pseudorandom_4d_f32 (%M, %M0, %K, %K0) : (index, index, index, index) -> tensor<?x?x?x?xf32>
         ^
 /tmp/a.mlir:26:10: note: see current operation: %1 = "std.tensor_load"(%0) : (tensor<?x?x?x?xf32>) -> none
 /tmp/a.mlir:26:10: error: 'std.tensor_load' op requires the same shape for all operands and results
  %lhs = call @generate_pseudorandom_4d_f32 (%M, %M0, %K, %K0) : (index, index, index, index) -> tensor<?x?x?x?xf32>
         ^
 /tmp/a.mlir:26:10: note: see current operation: %1 = "std.tensor_load"(%0) : (tensor<?x?x?x?xf32>) -> none
 // *** IR Dump After (anonymous namespace)::LinalgComprehensiveBufferizePass Failed ***
 #map0 = affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0 + d1 + d2 + d3)>
 #map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
 #map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>
 #map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
 #map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 "module"() ( {
  "func"() ( {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
    %0 = "tensor.generate"(%arg0, %arg1, %arg2, %arg3) ( {
    ^bb0(%arg4: index, %arg5: index, %arg6: index, %arg7: index):  // no predecessors
      %1 = "std.index_cast"(%arg4) : (index) -> i32
      %2 = "std.sitofp"(%1) : (i32) -> f32
      "tensor.yield"(%2) : (f32) -> ()
    }) : (index, index, index, index) -> tensor<?x?x?x?xf32>
    "std.return"(%0) : (tensor<?x?x?x?xf32>) -> ()
  }) {sym_name = "generate_pseudorandom_4d_f32", type = (index, index, index, index) -> tensor<?x?x?x?xf32>} : () -> ()
  "func"() ( {
    %c8 = "std.constant"() {value = 8 : index} : () -> index
    %c4 = "std.constant"() {value = 4 : index} : () -> index
    %c2 = "std.constant"() {value = 2 : index} : () -> index
    %c0 = "std.constant"() {value = 0 : index} : () -> index
    %cst = "std.constant"() {value = 0.000000e+00 : f32} : () -> f32
    %0 = "std.call"(%c2, %c8, %c4, %c2) {callee = @generate_pseudorandom_4d_f32} : (index, index, index, index) -> tensor<?x?x?x?xf32>
    %1 = "std.tensor_load"(%0) : (tensor<?x?x?x?xf32>) -> none
    %2 = "std.tensor_to_memref"(%1) : (none) -> memref<?x?x?x?xf32, #map0>
    %3 = "tensor.generate"() ( {
    ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
      %8 = "affine.apply"(%arg0, %arg1, %arg2, %arg3) {map = #map1} : (index, index, index, index) -> index
      %9 = "std.index_cast"(%8) : (index) -> i32
      %10 = "std.sitofp"(%9) : (i32) -> f32
      "tensor.yield"(%10) : (f32) -> ()
    }) : () -> tensor<3x4x4x2xf32>
    %4 = "std.tensor_to_memref"(%3) : (tensor<3x4x4x2xf32>) -> memref<3x4x4x2xf32, #map0>
    %5 = "tensor.generate"() ( {
    ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):  // no predecessors
      %8 = "affine.apply"(%arg0, %arg1, %arg2, %arg3) {map = #map1} : (index, index, index, index) -> index
      %9 = "std.index_cast"(%8) : (index) -> i32
      %10 = "std.sitofp"(%9) : (i32) -> f32
      "tensor.yield"(%10) : (f32) -> ()
    }) : () -> tensor<2x3x8x4xf32>
    %6 = "std.tensor_to_memref"(%5) : (tensor<2x3x8x4xf32>) -> memref<2x3x8x4xf32, #map0>
    "linalg.mmt_4d_kernel"(%2, %4, %6) ( {
    ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):  // no predecessors
      %8 = "std.mulf"(%arg0, %arg1) : (f32, f32) -> f32
      %9 = "std.addf"(%arg2, %8) : (f32, f32) -> f32
      "linalg.yield"(%9) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [#map2, #map3, #map4], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (memref<?x?x?x?xf32, #map0>, memref<3x4x4x2xf32, #map0>, memref<2x3x8x4xf32, #map0>) -> ()
    %7 = "vector.transfer_read"(%6, %c0, %c0, %c0, %c0, %cst) {masked = [false, false, false, false], permutation_map = #map5} : (memref<2x3x8x4xf32, #map0>, index, index, index, index, f32) -> vector<2x3x8x4xf32>
    "vector.print"(%7) : (vector<2x3x8x4xf32>) -> ()
    "std.dealloc"(%0) : (tensor<?x?x?x?xf32>) -> ()
    "std.return"() : () -> ()
  }) {sym_name = "main", type = () -> ()} : () -> ()
  "module_terminator"() : () -> ()
 }) : () -> ()

 ```