pashu123 · February 21, 2025 20:08
diff --git a/test_this_now.mlir b/test_this_now.mlir
 // Pipeline layout shared by the export and by every hal.interface op in the
 // function body: two push constants, one read-only storage buffer (binding 0)
 // and one storage buffer used for the result (binding 1).
 #pipeline_layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
 // Executable with a single ROCm variant. The exported entry point extends a
 // 64x1280 f16 tensor to f32, sums each row, and divides every row sum by 1280.
 hal.executable private @encode {
   hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
     // The export symbol must name the function in the inner module below.
     hal.executable.export public @matvec_fp16 ordinal(0) layout(#pipeline_layout) {
     ^bb0(%arg0: !hal.device):
       %x, %y, %z = flow.dispatch.workgroup_count_from_slice
       hal.return %x, %y, %z : index, index, index
     }
     builtin.module {
       func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>}>} {
         // Divisor for the final step (the reduced dimension has 1280 elements).
         %cst = arith.constant 1.280000e+03 : f32
         // Initial value of the row-sum accumulator.
         %cst_0 = arith.constant 0.000000e+00 : f32
         // NOTE(review): %0-%3 are not used by any op below; presumably they once
         // fed the subspan byte offsets - confirm, then rewire or remove.
         %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
         %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
         %2 = arith.index_castui %0 : i32 to index
         %3 = arith.index_castui %1 : i32 to index
         // The optional offset clause is omitted, so no byte offset is applied to
         // either binding.
         %5 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<64x1280xf16>>
         %6 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<64xf32>>
         %7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [64, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280xf16>> -> tensor<64x1280xf16>
         %8 = tensor.empty() : tensor<64xf32>
         %9 = tensor.empty() : tensor<64x1280xf32>
         // Step 1: element-wise f16 -> f32 extension of the whole input.
         // NOTE(review): the thread_basis counts [64, 32] multiply to 2048 while
         // workgroup_size is 64 - confirm this configuration is intended.
         %10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<64x1280xf16>) outs(%9 : tensor<64x1280xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1], [0, 1]], thread = [0, 8], thread_basis = [[64, 32], [0, 1]], workgroup = [64, 0]}>} {
         ^bb0(%in: f16, %out: f32):
           %14 = arith.extf %in : f16 to f32
           linalg.yield %14 : f32
         } -> tensor<64x1280xf32>
         // Step 2: zero-fill the accumulator, then sum along d1 (the reduction
         // iterator) to get one f32 per row.
         %11 = linalg.fill ins(%cst_0 : f32) outs(%8 : tensor<64xf32>) -> tensor<64xf32>
         %12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%10 : tensor<64x1280xf32>) outs(%11 : tensor<64xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 256], subgroup_basis = [[1, 1], [0, 1]], thread = [0, 8], thread_basis = [[64, 32], [0, 1]], workgroup = [64, 0]}>} {
         ^bb0(%in: f32, %out: f32):
           %14 = arith.addf %in, %out : f32
           linalg.yield %14 : f32
         } -> tensor<64xf32>
         // Step 3: divide every row sum by %cst (1280.0).
         %13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%12 : tensor<64xf32>) outs(%8 : tensor<64xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1], [0]], thread = [0], thread_basis = [[64], [0]], workgroup = [64]}>} {
         ^bb0(%in: f32, %out: f32):
           %14 = arith.divf %in, %cst : f32
           linalg.yield %14 : f32
         } -> tensor<64xf32>
         // Write the 64 results to binding 1.
         flow.dispatch.tensor.store %13, %6, offsets = [0], sizes = [64], strides = [1] : tensor<64xf32> -> !flow.dispatch.tensor<writeonly:tensor<64xf32>>
         return
       }
     }
   }
 }
	// Pipeline layout shared by the export and by every hal.interface op in the
	// function body: two push constants, one read-only storage buffer (binding 0)
	// and one storage buffer used for the result (binding 1).
	#pipeline_layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
	// Executable with a single ROCm variant. The exported entry point extends a
	// 64x1280 f16 tensor to f32, sums each row, and divides every row sum by 1280.
	hal.executable private @encode {
	  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
	    // The export symbol must name the function in the inner module below.
	    hal.executable.export public @matvec_fp16 ordinal(0) layout(#pipeline_layout) {
	    ^bb0(%arg0: !hal.device):
	      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
	      hal.return %x, %y, %z : index, index, index
	    }
	    builtin.module {
	      func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>}>} {
	        // Divisor for the final step (the reduced dimension has 1280 elements).
	        %cst = arith.constant 1.280000e+03 : f32
	        // Initial value of the row-sum accumulator.
	        %cst_0 = arith.constant 0.000000e+00 : f32
	        // NOTE(review): %0-%3 are not used by any op below; presumably they once
	        // fed the subspan byte offsets - confirm, then rewire or remove.
	        %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
	        %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
	        %2 = arith.index_castui %0 : i32 to index
	        %3 = arith.index_castui %1 : i32 to index
	        // The optional offset clause is omitted, so no byte offset is applied to
	        // either binding.
	        %5 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<64x1280xf16>>
	        %6 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<64xf32>>
	        %7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [64, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280xf16>> -> tensor<64x1280xf16>
	        %8 = tensor.empty() : tensor<64xf32>
	        %9 = tensor.empty() : tensor<64x1280xf32>
	        // Step 1: element-wise f16 -> f32 extension of the whole input.
	        // NOTE(review): the thread_basis counts [64, 32] multiply to 2048 while
	        // workgroup_size is 64 - confirm this configuration is intended.
	        %10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<64x1280xf16>) outs(%9 : tensor<64x1280xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1], [0, 1]], thread = [0, 8], thread_basis = [[64, 32], [0, 1]], workgroup = [64, 0]}>} {
	        ^bb0(%in: f16, %out: f32):
	          %14 = arith.extf %in : f16 to f32
	          linalg.yield %14 : f32
	        } -> tensor<64x1280xf32>
	        // Step 2: zero-fill the accumulator, then sum along d1 (the reduction
	        // iterator) to get one f32 per row.
	        %11 = linalg.fill ins(%cst_0 : f32) outs(%8 : tensor<64xf32>) -> tensor<64xf32>
	        %12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%10 : tensor<64x1280xf32>) outs(%11 : tensor<64xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 256], subgroup_basis = [[1, 1], [0, 1]], thread = [0, 8], thread_basis = [[64, 32], [0, 1]], workgroup = [64, 0]}>} {
	        ^bb0(%in: f32, %out: f32):
	          %14 = arith.addf %in, %out : f32
	          linalg.yield %14 : f32
	        } -> tensor<64xf32>
	        // Step 3: divide every row sum by %cst (1280.0).
	        %13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%12 : tensor<64xf32>) outs(%8 : tensor<64xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1], [0]], thread = [0], thread_basis = [[64], [0]], workgroup = [64]}>} {
	        ^bb0(%in: f32, %out: f32):
	          %14 = arith.divf %in, %cst : f32
	          linalg.yield %14 : f32
	        } -> tensor<64xf32>
	        // Write the 64 results to binding 1.
	        flow.dispatch.tensor.store %13, %6, offsets = [0], sizes = [64], strides = [1] : tensor<64xf32> -> !flow.dispatch.tensor<writeonly:tensor<64xf32>>
	        return
	      }
	    }
	  }
	}