
@pashu123
Created May 22, 2025 20:42
#executable_target_embedded_elf_x86_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d2)>
#pipeline_layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
module {
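  // CPU dispatch: dynamically shaped mmt4d (16x16x1 inner tiles, f32) with a fused bias-add + ReLU epilogue.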
  func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64} {
    %c0 = arith.constant 0 : index
    %c32_i64 = arith.constant 32 : i64
    %cst = arith.constant 0.000000e+00 : f32
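    // Load the ten 32-bit push constants; each (lo, hi) pair encodes one 64-bit dynamic dimension size.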
    %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
    %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
    %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
    %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
    %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
    %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
    %6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
    %7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
    %8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32
    %9 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : i32
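    // Reassemble each (lo, hi) i32 pair into an i64 and cast it to index.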
    %10 = arith.extui %0 : i32 to i64
    %11 = arith.extui %1 : i32 to i64
    %12 = arith.shli %11, %c32_i64 : i64
    %13 = arith.ori %10, %12 : i64
    %14 = arith.index_castui %13 : i64 to index
    %15 = arith.extui %2 : i32 to i64
    %16 = arith.extui %3 : i32 to i64
    %17 = arith.shli %16, %c32_i64 : i64
    %18 = arith.ori %15, %17 : i64
    %19 = arith.index_castui %18 : i64 to index
    %20 = arith.extui %4 : i32 to i64
    %21 = arith.extui %5 : i32 to i64
    %22 = arith.shli %21, %c32_i64 : i64
    %23 = arith.ori %20, %22 : i64
    %24 = arith.index_castui %23 : i64 to index
    %25 = arith.extui %6 : i32 to i64
    %26 = arith.extui %7 : i32 to i64
    %27 = arith.shli %26, %c32_i64 : i64
    %28 = arith.ori %25, %27 : i64
    %29 = arith.index_castui %28 : i64 to index
    %30 = arith.extui %8 : i32 to i64
    %31 = arith.extui %9 : i32 to i64
    %32 = arith.shli %31, %c32_i64 : i64
    %33 = arith.ori %30, %32 : i64
    %34 = arith.index_castui %33 : i64 to index
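    // Assume each dynamic size lies in [0, 2^53 - 1] (umax = 9007199254740991), bounding later index arithmetic.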
    %35:5 = util.assume.int
        %14<umin = 0, umax = 9007199254740991>,
        %19<umin = 0, umax = 9007199254740991>,
        %24<umin = 0, umax = 9007199254740991>,
        %29<umin = 0, umax = 9007199254740991>,
        %34<umin = 0, umax = 9007199254740991>
      : index, index, index, index, index
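    // Tie the assumed values to the dispatch workload ordinals (the five dynamic extents of the operands).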
    %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
    %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
    %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
    %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
    %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
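    // Buffer bindings: packed LHS (binding 0), packed RHS (binding 1), bias (binding 2), and the write-only result (binding 3).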
    %41 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
    %42 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
    %43 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
    %44 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
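    // Load the operands at their full dynamic extents.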
    %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
    %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
    %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
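    // Zero-fill the accumulator and run the packed matmul (linalg.mmt4d on 16x16x1 inner tiles).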
    %48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
    %49 = linalg.fill ins(%cst : f32) outs(%48 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
    %50 = linalg.mmt4d ins(%45, %46 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%49 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
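    // Fused epilogue: add the bias and apply ReLU (maximumf with 0).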
    %51 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%50, %47 : tensor<?x?x16x16xf32>, tensor<?x16xf32>) outs(%48 : tensor<?x?x16x16xf32>) {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      %52 = arith.addf %in, %in_0 : f32
      %53 = arith.maximumf %52, %cst : f32
      linalg.yield %53 : f32
    } -> tensor<?x?x16x16xf32>
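    // Write the result back to the output binding.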
    iree_tensor_ext.dispatch.tensor.store %51, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
    return
  }
}