pashu123 · October 29, 2024 13:44
diff --git a/module_main_graph$async_dispatch_3.mlir b/module_main_graph$async_dispatch_3.mlir
 // Executable holding a single dispatch entry point. The function body below
 // writes an all-zero i8 tensor of shape ?x?x128x384 to binding 0; the two
 // dynamic dimensions are passed in as four 32-bit constants.
 // NOTE(review): the symbol name mentions "transpose", but the body visible here
 // only produces zeros -- confirm whether that is the intended result of earlier
 // folding.
 hal.executable public @main_graph$async_dispatch_3 {
  // Variant for the "llvm-cpu" backend, format "embedded-elf-x86_64", cpu = "znver4".
  // The target attribute also carries the CPU feature list, the data layout
  // string, the native vector size (64) and the target triple.
  hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
    // Export ordinal 0. The pipeline layout declares 4 constants and one
    // storage_buffer binding. The region returns the workgroup count (x, y, z)
    // derived from the four index workload arguments.
    hal.executable.export public @main_graph$async_dispatch_3_transpose_DxDx128x384_i1 ordinal(0) layout(#hal.pipeline.layout<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
    ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3, %arg4
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: takes no arguments and returns nothing; all inputs come
      // from the interface constants and the only output is binding 0.
      func.func @main_graph$async_dispatch_3_transpose_DxDx128x384_i1() {
        %c0 = arith.constant 0 : index
        %c32_i64 = arith.constant 32 : i64
        %false = arith.constant false
        // Load the four i32 constants (ordinals 0..3).
        %0 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
        %1 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
        %2 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
        %3 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
        // First dynamic dimension: %8 = index(zext(%0) | (zext(%1) << 32)),
        // i.e. ordinal 0 is the low 32 bits and ordinal 1 the high 32 bits.
        %4 = arith.extui %0 : i32 to i64
        %5 = arith.extui %1 : i32 to i64
        %6 = arith.shli %5, %c32_i64 : i64
        %7 = arith.ori %4, %6 : i64
        %8 = arith.index_castui %7 : i64 to index
        // Second dynamic dimension: %13 is built the same way from ordinals 2 (low) and 3 (high).
        %9 = arith.extui %2 : i32 to i64
        %10 = arith.extui %3 : i32 to i64
        %11 = arith.shli %10, %c32_i64 : i64
        %12 = arith.ori %9, %11 : i64
        %13 = arith.index_castui %12 : i64 to index
        // Write-only view of binding 0 at offset 0 as tensor<?x?x128x384xi8>
        // with dynamic sizes {%8, %13}.
        %14 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<?x?x128x384xi8>>{%8, %13}
        // Build an i1 tensor of the same shape in which every element is `false`;
        // the generic op has no inputs and ignores the incoming %out value.
        %15 = tensor.empty(%8, %13) : tensor<?x?x128x384xi1>
        %16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%15 : tensor<?x?x128x384xi1>) {
        ^bb0(%out: i1):
          linalg.yield %false : i1
        } -> tensor<?x?x128x384xi1>
        // Zero-extend i1 -> i8 so the element type matches the binding, then
        // store the whole tensor (offsets 0, unit strides).
        %17 = arith.extui %16 : tensor<?x?x128x384xi1> to tensor<?x?x128x384xi8>
        flow.dispatch.tensor.store %17, %14, offsets = [0, 0, 0, 0], sizes = [%8, %13, 128, 384], strides = [1, 1, 1, 1] : tensor<?x?x128x384xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x128x384xi8>>{%8, %13}
        return
      }
    }
  }
 }
	// Executable with one dispatch entry point. The dispatch writes an all-zero
	// i8 tensor of shape ?x?x128x384 to binding 0; both dynamic extents arrive
	// as pairs of 32-bit constants (low word first).
	hal.executable public @main_graph$async_dispatch_3 {
	  // llvm-cpu / embedded-elf-x86_64 variant (cpu = "znver4").
	  hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
	    // Export ordinal 0: layout has 4 constants and one storage_buffer binding.
	    // The region maps the four workload indices to a workgroup count.
	    hal.executable.export public @main_graph$async_dispatch_3_transpose_DxDx128x384_i1 ordinal(0) layout(#hal.pipeline.layout<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
	    ^bb0(%device: !hal.device, %workload0: index, %workload1: index, %workload2: index, %workload3: index):
	      %wg_x, %wg_y, %wg_z = flow.dispatch.workgroup_count_from_dag_root %workload0, %workload1, %workload2, %workload3
	      hal.return %wg_x, %wg_y, %wg_z : index, index, index
	    }
	    builtin.module {
	      func.func @main_graph$async_dispatch_3_transpose_DxDx128x384_i1() {
	        %shift32 = arith.constant 32 : i64

	        // Raw 32-bit halves of the two dynamic extents (ordinals 0..3).
	        %dim0_lo = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
	        %dim0_hi = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
	        %dim1_lo = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
	        %dim1_hi = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32

	        // dim0 = index(zext(lo) | (zext(hi) << 32))
	        %dim0_lo_i64 = arith.extui %dim0_lo : i32 to i64
	        %dim0_hi_i64 = arith.extui %dim0_hi : i32 to i64
	        %dim0_hi_shifted = arith.shli %dim0_hi_i64, %shift32 : i64
	        %dim0_i64 = arith.ori %dim0_lo_i64, %dim0_hi_shifted : i64
	        %dim0 = arith.index_castui %dim0_i64 : i64 to index

	        // dim1 = index(zext(lo) | (zext(hi) << 32))
	        %dim1_lo_i64 = arith.extui %dim1_lo : i32 to i64
	        %dim1_hi_i64 = arith.extui %dim1_hi : i32 to i64
	        %dim1_hi_shifted = arith.shli %dim1_hi_i64, %shift32 : i64
	        %dim1_i64 = arith.ori %dim1_lo_i64, %dim1_hi_shifted : i64
	        %dim1 = arith.index_castui %dim1_i64 : i64 to index

	        // Write-only destination: binding 0 at offset 0, viewed as ?x?x128x384 of i8.
	        %offset0 = arith.constant 0 : index
	        %dst = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%offset0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<?x?x128x384xi8>>{%dim0, %dim1}

	        // Every element of the i1 tensor is set to false; the incoming output
	        // element is not read.
	        %zero_bit = arith.constant false
	        %init = tensor.empty(%dim0, %dim1) : tensor<?x?x128x384xi1>
	        %zeros_i1 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%init : tensor<?x?x128x384xi1>) {
	        ^bb0(%ignored: i1):
	          linalg.yield %zero_bit : i1
	        } -> tensor<?x?x128x384xi1>

	        // Widen i1 -> i8 to match the binding's element type and store the full tensor.
	        %zeros_i8 = arith.extui %zeros_i1 : tensor<?x?x128x384xi1> to tensor<?x?x128x384xi8>
	        flow.dispatch.tensor.store %zeros_i8, %dst, offsets = [0, 0, 0, 0], sizes = [%dim0, %dim1, 128, 384], strides = [1, 1, 1, 1] : tensor<?x?x128x384xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x128x384xi8>>{%dim0, %dim1}
	        return
	      }
	    }
	  }
	}