Skip to content

Instantly share code, notes, and snippets.

@AmosLewis
Created August 10, 2024 00:16
Show Gist options
  • Save AmosLewis/1f7b164069fbc1edbe3b743b5a768e15 to your computer and use it in GitHub Desktop.
// IREE HAL executable for dispatch region 25: a quantized batch matmul
// (B=56, M=56, N=128, K=512; i8 x i8 inputs accumulated in i32) fused with a
// requantization epilogue that also transposes the output layout from
// 56x56x128 to 128x56x56 and adds a third i8 input before the final round.
hal.executable public @torch_jit_dispatch_25 {
// Variant compiled for the llvm-cpu backend: generic x86_64, embedded ELF,
// 16-byte native vector size.
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
// Export: pipeline layout has one descriptor set with three storage buffers
// (bindings 0 and 1 read-only, binding 2 writable), no push constants.
hal.executable.export public @torch_jit_dispatch_25_quantized_batch_matmul_56x56x128x512_i8xi8xi32xi32xi32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
// Workgroup count is derived from the dispatch workload (materialized later
// in the pipeline from the slice of the flow-level dispatch).
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @torch_jit_dispatch_25_quantized_batch_matmul_56x56x128x512_i8xi8xi32xi32xi32() {
// Zero point used for both matmul operands (see the quantized_batch_matmul
// below), so the matmul reduces to a plain i8 x i8 -> i32 accumulation.
%c0_i32 = arith.constant 0 : i32
// Requantization scales. 1.22070313E-4 = 1/8192, 3.125e-02 = 1/32,
// 7.8125e-03 = 1/128. %cst_1 (0.0) is added after each roundeven, i.e. the
// output zero point is zero at every stage.
%cst = arith.constant 1.22070313E-4 : f32
%cst_0 = arith.constant 3.125000e-02 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
// Clamp bounds for signed-i8 saturation: [-128, 127].
%cst_2 = arith.constant -1.280000e+02 : f32
%cst_3 = arith.constant 1.270000e+02 : f32
%cst_4 = arith.constant 7.812500e-03 : f32
// Byte offsets into the bound buffers; binding 0 holds three distinct
// tensors at offsets 802816, 6078464 and 0.
%c802816 = arith.constant 802816 : index
%c6078464 = arith.constant 6078464 : index
%c0 = arith.constant 0 : index
%c99029440 = arith.constant 99029440 : index
%c99124928 = arith.constant 99124928 : index
%c401408 = arith.constant 401408 : index
// LHS of the batch matmul: 56x56x512 i8 (binding 0, offset 802816).
%0 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c802816) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<56x56x512xi8>>
// RHS of the batch matmul: 56x512x128 i8 (binding 0, offset 6078464).
%1 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c6078464) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<56x512x128xi8>>
// Two per-channel f32 vectors of length 128 (binding 1); broadcast along
// the channel dimension in the epilogue below.
%2 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c99029440) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128xf32>>
%3 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c99124928) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128xf32>>
// Third i8 input, already in 128x56x56 layout (binding 0, offset 0); it is
// dequantized and added in the epilogue.
// NOTE(review): presumably a residual/skip connection from an earlier
// layer — confirm against the original model graph.
%4 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x56x56xi8>>
// Write-only output: 128x56x56 i8 (binding 2, offset 401408).
%5 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c401408) : !flow.dispatch.tensor<writeonly:tensor<128x56x56xi8>>
// Materialize full (non-tiled) loads of all inputs.
%6 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [56, 56, 512], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<56x56x512xi8>> -> tensor<56x56x512xi8>
%7 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [56, 512, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<56x512x128xi8>> -> tensor<56x512x128xi8>
%8 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xf32>> -> tensor<128xf32>
%9 = flow.dispatch.tensor.load %3, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xf32>> -> tensor<128xf32>
%10 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0], sizes = [128, 56, 56], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x56x56xi8>> -> tensor<128x56x56xi8>
// %11: destination for the transposed i8 result; %12/%13: zero-filled i32
// accumulator for the matmul.
%11 = tensor.empty() : tensor<128x56x56xi8>
%12 = tensor.empty() : tensor<56x56x128xi32>
%13 = linalg.fill ins(%c0_i32 : i32) outs(%12 : tensor<56x56x128xi32>) -> tensor<56x56x128xi32>
// Batched quantized matmul with both zero points = 0:
// %14[b,m,n] = sum_k %6[b,m,k] * %7[b,k,n], accumulated in i32.
%14 = linalg.quantized_batch_matmul ins(%6, %7, %c0_i32, %c0_i32 : tensor<56x56x512xi8>, tensor<56x512x128xi8>, i32, i32) outs(%13 : tensor<56x56x128xi32>) -> tensor<56x56x128xi32>
// Fused epilogue. Indexing maps: %8/%9 broadcast over (d0,d1) and index by
// channel d2; the matmul result is read as (d0,d1,d2); the third input and
// the output use (d2,d0,d1), i.e. the epilogue transposes to channel-major.
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2, d0, d1)>, affine_map<(d0, d1, d2) -> (d2, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%8, %9, %14, %10 : tensor<128xf32>, tensor<128xf32>, tensor<56x56x128xi32>, tensor<128x56x56xi8>) outs(%11 : tensor<128x56x56xi8>) {
^bb0(%in: f32, %in_5: f32, %in_6: i32, %in_7: i8, %out: i8):
// Stage 1: dequantize the i32 accumulator (x 1/8192), add the per-channel
// bias %in_5, then requantize with scale 1/32: round, add zero point 0,
// clamp to [-128,127], truncate to i8.
%16 = arith.sitofp %in_6 : i32 to f32
%17 = arith.mulf %16, %cst : f32
%18 = arith.addf %in_5, %17 : f32
%19 = arith.divf %18, %cst_0 : f32
%20 = math.roundeven %19 : f32
%21 = arith.addf %20, %cst_1 : f32
%22 = arith.maximumf %21, %cst_2 : f32
%23 = arith.minimumf %22, %cst_3 : f32
%24 = arith.fptosi %23 : f32 to i8
// Stage 2: dequantize stage 1 (x 1/32), multiply by the per-channel scale
// %in, and requantize with scale 1/128 (same round/clamp pattern).
%25 = arith.extsi %24 : i8 to i32
%26 = arith.sitofp %25 : i32 to f32
%27 = arith.mulf %26, %cst_0 : f32
%28 = arith.mulf %in, %27 : f32
// Dequantize the third i8 input with scale 1/32 (%31), used in stage 3.
%29 = arith.extsi %in_7 : i8 to i32
%30 = arith.sitofp %29 : i32 to f32
%31 = arith.mulf %30, %cst_0 : f32
%32 = arith.divf %28, %cst_4 : f32
%33 = math.roundeven %32 : f32
%34 = arith.addf %33, %cst_1 : f32
%35 = arith.maximumf %34, %cst_2 : f32
%36 = arith.minimumf %35, %cst_3 : f32
%37 = arith.fptosi %36 : f32 to i8
// Stage 3: dequantize stage 2 (x 1/128), add the dequantized third input,
// and requantize the sum with scale 1/32 to produce the final i8 value.
%38 = arith.extsi %37 : i8 to i32
%39 = arith.sitofp %38 : i32 to f32
%40 = arith.mulf %39, %cst_4 : f32
%41 = arith.addf %40, %31 : f32
%42 = arith.divf %41, %cst_0 : f32
%43 = math.roundeven %42 : f32
%44 = arith.addf %43, %cst_1 : f32
%45 = arith.maximumf %44, %cst_2 : f32
%46 = arith.minimumf %45, %cst_3 : f32
%47 = arith.fptosi %46 : f32 to i8
linalg.yield %47 : i8
} -> tensor<128x56x56xi8>
// Store the transposed, requantized result into the output binding.
flow.dispatch.tensor.store %15, %5, offsets = [0, 0, 0], sizes = [128, 56, 56], strides = [1, 1, 1] : tensor<128x56x56xi8> -> !flow.dispatch.tensor<writeonly:tensor<128x56x56xi8>>
return
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment