Created
May 13, 2025 22:59
-
-
Save pashu123/2951e0e0c3bf1e1ce8ca84894f4da4be to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
func.func @prefill_bs4$async_dispatch_20_elementwise_4xDx4096_bf16xf32xf32xf32xf32xf8E4M3FNUZ() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
%c32 = arith.constant 32 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%c2_i64 = arith.constant 2 : i64 | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 4.096000e+03 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%cst_2 = arith.constant -2.400000e+02 : f32 | |
%cst_3 = arith.constant 2.400000e+02 : f32 | |
%c524288 = arith.constant 524288 : index | |
%c1092624896 = arith.constant 1092624896 : index | |
%c1092633088 = arith.constant 1092633088 : index | |
%0 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = arith.extui %0 : i32 to i64 | |
%10 = arith.extui %1 : i32 to i64 | |
%11 = arith.shli %10, %c32_i64 : i64 | |
%12 = arith.ori %9, %11 : i64 | |
%13 = arith.index_castui %12 : i64 to index | |
%14 = arith.extui %2 : i32 to i64 | |
%15 = arith.extui %3 : i32 to i64 | |
%16 = arith.shli %15, %c32_i64 : i64 | |
%17 = arith.ori %14, %16 : i64 | |
%18 = arith.index_castui %17 : i64 to index | |
%19 = arith.extui %4 : i32 to i64 | |
%20 = arith.extui %5 : i32 to i64 | |
%21 = arith.shli %20, %c32_i64 : i64 | |
%22 = arith.ori %19, %21 : i64 | |
%23 = arith.index_castui %22 : i64 to index | |
%24 = arith.extui %6 : i32 to i64 | |
%25 = arith.extui %7 : i32 to i64 | |
%26 = arith.shli %25, %c32_i64 : i64 | |
%27 = arith.ori %24, %26 : i64 | |
%28 = arith.index_castui %27 : i64 to index | |
%29 = arith.index_castui %8 : i32 to index | |
%30:5 = util.assume.int | |
%13<umin = 72369280, umax = 38775538688>, | |
%18<umin = 74466432, umax = 47363376128>, | |
%23<umin = 76563584, umax = 55951213568>, | |
%28<umin = 69206016, umax = 8654946304>, | |
%29<umin = 32, umax = 131040, udiv = 32> | |
: index, index, index, index, index | |
%31 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c524288) flags(ReadOnly) {iree_gpu.use_rocdl_buffer_instructions} : !iree_tensor_ext.dispatch.tensor<readonly:tensor<f32>> | |
%32 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c1092624896) flags(ReadOnly) {iree_gpu.use_rocdl_buffer_instructions} : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4096xbf16>> | |
%33 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c1092633088) flags(ReadOnly) {iree_gpu.use_rocdl_buffer_instructions} : !iree_tensor_ext.dispatch.tensor<readonly:tensor<f32>> | |
%34 = iree_tensor_ext.dispatch.workload.ordinal %30#4, 0 : index | |
%35 = arith.divsi %34, %c32 : index | |
%36 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c67108864) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x?x32x4096xbf16>>{%35} | |
%37 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%30#0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x?x32x4096xf32>>{%35} | |
%38 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%30#1) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4x?x32x4096xf32>>{%35} | |
%39 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(4) alignment(64) offset(%30#2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4x?x32x4096xf32>>{%35} | |
%40 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(5) alignment(64) offset(%30#3) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4x?x32x4096xf8E4M3FNUZ>>{%35} | |
%41 = iree_tensor_ext.dispatch.tensor.load %31, offsets = [], sizes = [], strides = [] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32> | |
%42 = iree_tensor_ext.dispatch.tensor.load %32, offsets = [0], sizes = [4096], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4096xbf16>> -> tensor<4096xbf16> | |
%43 = iree_tensor_ext.dispatch.tensor.load %33, offsets = [], sizes = [], strides = [] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32> | |
%44 = iree_tensor_ext.dispatch.tensor.load %36, offsets = [0, 0, 0, 0], sizes = [4, %35, 32, 4096], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x?x32x4096xbf16>>{%35} -> tensor<4x?x32x4096xbf16> | |
%45 = iree_tensor_ext.dispatch.tensor.load %37, offsets = [0, 0, 0, 0], sizes = [4, %35, 32, 4096], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x?x32x4096xf32>>{%35} -> tensor<4x?x32x4096xf32> | |
%46 = affine.apply affine_map<()[s0] -> (s0 floordiv 32)>()[%34] | |
%47 = tensor.empty(%46) : tensor<4x?x32x4096xf32> | |
%48 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%44, %45, %41 : tensor<4x?x32x4096xbf16>, tensor<4x?x32x4096xf32>, tensor<f32>) outs(%47 : tensor<4x?x32x4096xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 0, 4096], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 4], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]]}>} { | |
^bb0(%in: bf16, %in_4: f32, %in_5: f32, %out: f32): | |
%54 = arith.mulf %in_4, %in_5 : f32 | |
%55 = arith.extf %in : bf16 to f32 | |
%56 = arith.addf %55, %54 : f32 | |
linalg.yield %56 : f32 | |
} -> tensor<4x?x32x4096xf32> | |
%49 = tensor.empty(%46) : tensor<4x?x32xf32> | |
%50 = linalg.fill ins(%cst : f32) outs(%49 : tensor<4x?x32xf32>) -> tensor<4x?x32xf32> | |
%51 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%44, %45, %41 : tensor<4x?x32x4096xbf16>, tensor<4x?x32x4096xf32>, tensor<f32>) outs(%50 : tensor<4x?x32xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 0, 4096], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 4], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 1, 0]}>} { | |
^bb0(%in: bf16, %in_4: f32, %in_5: f32, %out: f32): | |
%54 = arith.mulf %in_4, %in_5 : f32 | |
%55 = arith.extf %in : bf16 to f32 | |
%56 = arith.addf %55, %54 : f32 | |
%57 = math.fpowi %56, %c2_i64 : f32, i64 | |
%58 = arith.addf %57, %out : f32 | |
linalg.yield %58 : f32 | |
} -> tensor<4x?x32xf32> | |
%52 = tensor.empty(%46) : tensor<4x?x32x4096xf8E4M3FNUZ> | |
%53:2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%42, %44, %45, %41, %51, %43 : tensor<4096xbf16>, tensor<4x?x32x4096xbf16>, tensor<4x?x32x4096xf32>, tensor<f32>, tensor<4x?x32xf32>, tensor<f32>) outs(%47, %52 : tensor<4x?x32x4096xf32>, tensor<4x?x32x4096xf8E4M3FNUZ>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 0, 4096], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 4], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]]}>} { | |
^bb0(%in: bf16, %in_4: bf16, %in_5: f32, %in_6: f32, %in_7: f32, %in_8: f32, %out: f32, %out_9: f8E4M3FNUZ): | |
%54 = arith.mulf %in_5, %in_6 : f32 | |
%55 = arith.extf %in_4 : bf16 to f32 | |
%56 = arith.addf %55, %54 : f32 | |
%57 = arith.divf %in_7, %cst_0 : f32 | |
%58 = arith.addf %57, %cst_1 : f32 | |
%59 = math.rsqrt %58 : f32 | |
%60 = arith.mulf %56, %59 : f32 | |
%61 = arith.extf %in : bf16 to f32 | |
%62 = arith.mulf %61, %60 : f32 | |
%63 = arith.divf %62, %in_8 : f32 | |
%64 = arith.cmpf ult, %63, %cst_2 : f32 | |
%65 = arith.select %64, %cst_2, %63 : f32 | |
%66 = arith.cmpf ugt, %65, %cst_3 : f32 | |
%67 = arith.select %66, %cst_3, %65 : f32 | |
%68 = arith.truncf %67 : f32 to f8E4M3FNUZ | |
linalg.yield %62, %68 : f32, f8E4M3FNUZ | |
} -> (tensor<4x?x32x4096xf32>, tensor<4x?x32x4096xf8E4M3FNUZ>) | |
iree_tensor_ext.dispatch.tensor.store %48, %38, offsets = [0, 0, 0, 0], sizes = [4, %35, 32, 4096], strides = [1, 1, 1, 1] : tensor<4x?x32x4096xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4x?x32x4096xf32>>{%35} | |
iree_tensor_ext.dispatch.tensor.store %53#0, %39, offsets = [0, 0, 0, 0], sizes = [4, %35, 32, 4096], strides = [1, 1, 1, 1] : tensor<4x?x32x4096xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4x?x32x4096xf32>>{%35} | |
iree_tensor_ext.dispatch.tensor.store %53#1, %40, offsets = [0, 0, 0, 0], sizes = [4, %35, 32, 4096], strides = [1, 1, 1, 1] : tensor<4x?x32x4096xf8E4M3FNUZ> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4x?x32x4096xf8E4M3FNUZ>>{%35} | |
return | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment