@pashu123
Created February 14, 2025 10:48
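
// IREE HAL executable for the decode_bs1 dispatch 19: an elementwise broadcast of a
// dynamically sized f8E4M3FNUZ tensor, compiled for the ROCm/HIP gfx942 (MI300-class) target.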
hal.executable public @decode_bs1$async_dispatch_19 {
  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>) {
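    // Export entry point; the workgroup count is derived from the two dynamic workload
    // operands forwarded by the host (%arg1, %arg2).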
    hal.executable.export public @decode_bs1$async_dispatch_19_elementwise_broadcast_Dx8x4x1x2x64_f8E4M3FNUZ ordinal(0) layout(#hal.pipeline.layout<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
    ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @decode_bs1$async_dispatch_19_elementwise_broadcast_Dx8x4x1x2x64_f8E4M3FNUZ() {
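        // Four i32 push constants: two buffer byte offsets (ordinals 0 and 1) and two
        // dynamic extents (ordinals 2 and 3), widened to index below.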
        %0 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
        %1 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
        %2 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
        %3 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
        %4 = arith.index_castui %0 : i32 to index
        %5 = arith.index_castui %1 : i32 to index
        %6 = arith.index_castui %2 : i32 to index
        %7 = arith.index_castui %3 : i32 to index
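        // Compiler-provided range/divisibility assumptions: the buffer offsets fall in fixed
        // ranges, and both dynamic extents are multiples of 32 in [32, 131040].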
        %8:4 = util.assume.int
            %4<umin = 67117824, umax = 67314240>,
            %5<umin = 67183360, umax = 335684160>,
            %6<umin = 32, umax = 131040, udiv = 32>,
            %7<umin = 32, umax = 131040, udiv = 32>
          : index, index, index, index
        %9 = flow.dispatch.workload.ordinal %8#2, 0 : index
        %10 = flow.dispatch.workload.ordinal %8#3, 1 : index
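        // Binding 0: read-only source tensor<?x8x1x2x64xf8E4M3FNUZ>.
        // Binding 1: write-only destination tensor<8x4x?x1x2x64xf8E4M3FNUZ>.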
        %11 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%8#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x8x1x2x64xf8E4M3FNUZ>>{%9}
        %12 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%8#1) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<8x4x?x1x2x64xf8E4M3FNUZ>>{%10}
        %13 = flow.dispatch.tensor.load %11, offsets = [0, 0, 0, 0, 0], sizes = [%9, 8, 1, 2, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x1x2x64xf8E4M3FNUZ>>{%9} -> tensor<?x8x1x2x64xf8E4M3FNUZ>
        %14 = tensor.empty(%10) : tensor<8x4x?x1x2x64xf8E4M3FNUZ>
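        // Pure broadcast/transpose copy: each input element (d0, d1, d3, d4, d5) is replicated
        // across the new d2 axis of size 4, and the dynamic dimension moves from position 0 in
        // the input to position 2 in the output.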
        %15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d0, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<?x8x1x2x64xf8E4M3FNUZ>) outs(%14 : tensor<8x4x?x1x2x64xf8E4M3FNUZ>) {
        ^bb0(%in: f8E4M3FNUZ, %out: f8E4M3FNUZ):
          linalg.yield %in : f8E4M3FNUZ
        } -> tensor<8x4x?x1x2x64xf8E4M3FNUZ>
        flow.dispatch.tensor.store %15, %12, offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 4, %10, 1, 2, 64], strides = [1, 1, 1, 1, 1, 1] : tensor<8x4x?x1x2x64xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<8x4x?x1x2x64xf8E4M3FNUZ>>{%10}
        return
      }
    }
  }
}