pashu123 · February 17, 2025 11:49
diff --git a/make.mlir b/make.mlir
 #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
 #map2 = affine_map<(d0, d1) -> (d0, d1)>
 #map3 = affine_map<(d0, d1, d2, d3, d4) -> (d1, d2, d0, d3, d4)>
 #map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1)>
 #map5 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
 #pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
 #config1 = #iree_gpu.lowering_config<{thread = [0, 0, 0, 4],
                                      thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]],
                                      subgroup_basis = [[1, 1, 1, 1], [0, 1, 2, 3]],
                                     partial_reduction = [0, 0, 1, 256],
                                     workgroup = [1, 1, 0, 0]
                                     }>
 #config2 = #iree_gpu.lowering_config<{thread = [0, 0, 0, 4],
                                      thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]],
                                      subgroup_basis = [[1, 1, 1, 1], [0, 1, 2, 3]],
                                     partial_reduction = [0, 0, 1, 256],
                                     workgroup = [1, 1, 0, 0]
                                     }>
 #config3 = #iree_gpu.lowering_config<{thread = [0, 0, 0, 0, 0],
                                      thread_basis = [[1, 1, 1, 1, 64], [0, 1, 2, 3, 4]],
                                      subgroup_basis = [[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]],
                                     workgroup = [1, 1, 0, 0, 0]
                                     }>
 #translation = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64>
 hal.executable private @matvec_fp16 {
  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
    hal.executable.export public @matvec_fp16 ordinal(0) layout(#pipeline_layout) {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
      hal.return %x, %y, %z : index, index, index
    }
  builtin.module {
  func.func @test() attributes {translation_info = #translation} {
    %cst = arith.constant 0.000000e+00 : f32
    %cst_0 = arith.constant 1.152000e+05 : f32
    %cst_1 = arith.constant 9.99999974E-6 : f32
    %c60267008 = arith.constant 60267008 : index
    %c85483008 = arith.constant 85483008 : index
    %c100228608 = arith.constant 100228608 : index
    %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x30x3840xf16>>
    %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x60x64xf16>>
    %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x60x64xf32>>
    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x30x3840xf16>> -> tensor<2x32x30x3840xf16>
    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [32, 30, 2, 60, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x60x64xf16>> -> tensor<32x30x2x60x64xf16>
    %5 = tensor.empty() : tensor<2x32x30x60x64xf32>
    %6 = tensor.empty() : tensor<2x32xf32>
    %7 = tensor.empty() : tensor<2x32x30x3840xf32>
    %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x30x3840xf16>) outs(%7 : tensor<2x32x30x3840xf32>) attrs =  {} {
    ^bb0(%in: f16, %out: f32):
      %14 = arith.extf %in : f16 to f32
      linalg.yield %14 : f32
    } -> tensor<2x32x30x3840xf32>
    %9 = linalg.fill ins(%cst : f32) outs(%6 : tensor<2x32xf32>) -> tensor<2x32xf32>
    %10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x30x3840xf32>) outs(%9 : tensor<2x32xf32>) {
    ^bb0(%in: f32, %out: f32):
      %14 = arith.addf %in, %out : f32
      linalg.yield %14 : f32
    } -> tensor<2x32xf32>
    %11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%6 : tensor<2x32xf32>) {
    ^bb0(%in: f32, %out: f32):
      %14 = arith.divf %in, %cst_0 : f32
      linalg.yield %14 : f32
    } -> tensor<2x32xf32>
    %12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x30x3840xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) attrs =  {lowering_config = #config2} {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %14 = arith.subf %in, %in_2 : f32
      %15 = arith.mulf %14, %14 : f32
      %16 = arith.addf %15, %out : f32
      linalg.yield %16 : f32
    } -> tensor<2x32xf32>
    %13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%4, %11, %12 : tensor<32x30x2x60x64xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32x30x60x64xf32>) attrs = {lowering_config = #config3} {
    ^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
      %14 = arith.divf %in_3, %cst_0 : f32
      %15 = arith.addf %14, %cst_1 : f32
      %16 = math.rsqrt %15 : f32
      %17 = arith.extf %in : f16 to f32
      %18 = arith.subf %17, %in_2 : f32
      %19 = arith.mulf %18, %16 : f32
      linalg.yield %19 : f32
    } -> tensor<2x32x30x60x64xf32>
    flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0, 0], sizes = [2, 32, 30, 60, 64], strides = [1, 1, 1, 1, 1] : tensor<2x32x30x60x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x60x64xf32>>
    return
  }
  }
  }
  }
	#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
	#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
	#map2 = affine_map<(d0, d1) -> (d0, d1)>
	#map3 = affine_map<(d0, d1, d2, d3, d4) -> (d1, d2, d0, d3, d4)>
	#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1)>
	#map5 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
	#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly\|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
	#config1 = #iree_gpu.lowering_config<{thread = [0, 0, 0, 4],
	thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]],
	subgroup_basis = [[1, 1, 1, 1], [0, 1, 2, 3]],
	partial_reduction = [0, 0, 1, 256],
	workgroup = [1, 1, 0, 0]
	}>
	#config2 = #iree_gpu.lowering_config<{thread = [0, 0, 0, 4],
	thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]],
	subgroup_basis = [[1, 1, 1, 1], [0, 1, 2, 3]],
	partial_reduction = [0, 0, 1, 256],
	workgroup = [1, 1, 0, 0]
	}>
	#config3 = #iree_gpu.lowering_config<{thread = [0, 0, 0, 0, 0],
	thread_basis = [[1, 1, 1, 1, 64], [0, 1, 2, 3, 4]],
	subgroup_basis = [[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]],
	workgroup = [1, 1, 0, 0, 0]
	}>
	#translation = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64>
	hal.executable private @matvec_fp16 {
	hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
	hal.executable.export public @matvec_fp16 ordinal(0) layout(#pipeline_layout) {
	^bb0(%arg0: !hal.device):
	%x, %y, %z = flow.dispatch.workgroup_count_from_slice
	hal.return %x, %y, %z : index, index, index
	}
	builtin.module {
	func.func @test() attributes {translation_info = #translation} {
	%cst = arith.constant 0.000000e+00 : f32
	%cst_0 = arith.constant 1.152000e+05 : f32
	%cst_1 = arith.constant 9.99999974E-6 : f32
	%c60267008 = arith.constant 60267008 : index
	%c85483008 = arith.constant 85483008 : index
	%c100228608 = arith.constant 100228608 : index
	%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly\|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x30x3840xf16>>
	%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly\|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x60x64xf16>>
	%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x60x64xf32>>
	%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x30x3840xf16>> -> tensor<2x32x30x3840xf16>
	%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [32, 30, 2, 60, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x60x64xf16>> -> tensor<32x30x2x60x64xf16>
	%5 = tensor.empty() : tensor<2x32x30x60x64xf32>
	%6 = tensor.empty() : tensor<2x32xf32>
	%7 = tensor.empty() : tensor<2x32x30x3840xf32>
	%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x30x3840xf16>) outs(%7 : tensor<2x32x30x3840xf32>) attrs = {} {
	^bb0(%in: f16, %out: f32):
	%14 = arith.extf %in : f16 to f32
	linalg.yield %14 : f32
	} -> tensor<2x32x30x3840xf32>
	%9 = linalg.fill ins(%cst : f32) outs(%6 : tensor<2x32xf32>) -> tensor<2x32xf32>
	%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x30x3840xf32>) outs(%9 : tensor<2x32xf32>) {
	^bb0(%in: f32, %out: f32):
	%14 = arith.addf %in, %out : f32
	linalg.yield %14 : f32
	} -> tensor<2x32xf32>
	%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%6 : tensor<2x32xf32>) {
	^bb0(%in: f32, %out: f32):
	%14 = arith.divf %in, %cst_0 : f32
	linalg.yield %14 : f32
	} -> tensor<2x32xf32>
	%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x30x3840xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) attrs = {lowering_config = #config2} {
	^bb0(%in: f32, %in_2: f32, %out: f32):
	%14 = arith.subf %in, %in_2 : f32
	%15 = arith.mulf %14, %14 : f32
	%16 = arith.addf %15, %out : f32
	linalg.yield %16 : f32
	} -> tensor<2x32xf32>
	%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%4, %11, %12 : tensor<32x30x2x60x64xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32x30x60x64xf32>) attrs = {lowering_config = #config3} {
	^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
	%14 = arith.divf %in_3, %cst_0 : f32
	%15 = arith.addf %14, %cst_1 : f32
	%16 = math.rsqrt %15 : f32
	%17 = arith.extf %in : f16 to f32
	%18 = arith.subf %17, %in_2 : f32
	%19 = arith.mulf %18, %16 : f32
	linalg.yield %19 : f32
	} -> tensor<2x32x30x60x64xf32>
	flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0, 0], sizes = [2, 32, 30, 60, 64], strides = [1, 1, 1, 1, 1] : tensor<2x32x30x60x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x60x64xf32>>
	return
	}
	}
	}
	}