hal.executable public @run_forward$async_dispatch_1023 {
  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>) {
    hal.executable.export public @run_forward$async_dispatch_1023_elementwise_2x32x60x3840_f16xf32xf32xf32 ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @run_forward$async_dispatch_1023_elementwise_2x32x60x3840_f16xf32xf32xf32() {
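        // Group-normalization-style kernel: for each (d0, d1) slice of the
        // 2x32x60x3840 f16 input, reduce the 60x3840 = 230400 elements of
        // (d2, d3) to a mean and variance, then emit
        // (x - mean) * rsqrt(variance + 1e-5) widened to f32.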
        %cst = arith.constant 0.000000e+00 : f32
        %cst_0 = arith.constant 2.304000e+05 : f32
        %cst_1 = arith.constant 9.99999974E-6 : f32
        %c85483008 = arith.constant 85483008 : index
        %c165416448 = arith.constant 165416448 : index
        %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x60x3840xf16>>
        %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c165416448) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x60x3840xf32>>
        %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 32, 60, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x60x3840xf16>> -> tensor<2x32x60x3840xf16>
        %3 = tensor.empty() : tensor<2x32x60x3840xf32>
        %4 = tensor.empty() : tensor<2x32xf32>
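        // Widen the f16 input to f32 so the reductions accumulate at full precision.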
        %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<2x32x60x3840xf16>) outs(%3 : tensor<2x32x60x3840xf32>) {
        ^bb0(%in: f16, %out: f32):
          %11 = arith.extf %in : f16 to f32
          linalg.yield %11 : f32
        } -> tensor<2x32x60x3840xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32>
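        // Reduce the widened input over (d2, d3) into the zero-filled per-(d0, d1) accumulator.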
        %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x60x3840xf32>) outs(%6 : tensor<2x32xf32>) {
        ^bb0(%in: f32, %out: f32):
          %11 = arith.addf %in, %out : f32
          linalg.yield %11 : f32
        } -> tensor<2x32xf32>
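        // mean = sum / 230400 (cst_0 = 60 * 3840, the element count per slice).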
        %8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) {
        ^bb0(%in: f32, %out: f32):
          %11 = arith.divf %in, %cst_0 : f32
          linalg.yield %11 : f32
        } -> tensor<2x32xf32>
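        // Accumulate the squared deviations from the mean over (d2, d3).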
        %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5, %8 : tensor<2x32x60x3840xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32xf32>) {
        ^bb0(%in: f32, %in_2: f32, %out: f32):
          %11 = arith.subf %in, %in_2 : f32
          %12 = arith.mulf %11, %11 : f32
          %13 = arith.addf %12, %out : f32
          linalg.yield %13 : f32
        } -> tensor<2x32xf32>
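        // Divide the squared-deviation sum by 230400 to get the variance, then
        // emit (x - mean) * rsqrt(variance + cst_1).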
        %10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2, %8, %9 : tensor<2x32x60x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%3 : tensor<2x32x60x3840xf32>) {
        ^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
          %11 = arith.divf %in_3, %cst_0 : f32
          %12 = arith.addf %11, %cst_1 : f32
          %13 = math.rsqrt %12 : f32
          %14 = arith.extf %in : f16 to f32
          %15 = arith.subf %14, %in_2 : f32
          %16 = arith.mulf %15, %13 : f32
          linalg.yield %16 : f32
        } -> tensor<2x32x60x3840xf32>
        flow.dispatch.tensor.store %10, %1, offsets = [0, 0, 0, 0], sizes = [2, 32, 60, 3840], strides = [1, 1, 1, 1] : tensor<2x32x60x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x60x3840xf32>>
        return
      }
    }
  }
}
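
For reference, a minimal NumPy sketch of the arithmetic this dispatch encodes (the function and variable names are illustrative, not part of the IR; the constants are read off the arith.constant ops above):

import numpy as np

def dispatch_1023_reference(x: np.ndarray) -> np.ndarray:
    """Per-(d0, d1) normalization of a 2x32x60x3840 f16 tensor."""
    xf = x.astype(np.float32)                      # arith.extf: f16 -> f32
    n = 60 * 3840                                  # cst_0 = 2.304e5
    mean = xf.sum(axis=(2, 3), keepdims=True) / n  # %7 (sum), %8 (divf)
    ssd = ((xf - mean) ** 2).sum(axis=(2, 3), keepdims=True)  # %9
    return (xf - mean) / np.sqrt(ssd / n + 1e-5)   # %10: (x - mean) * rsqrt(var + eps)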