// Source: GitHub gist pashu123/4bc9e36938a7389e6c7cee23872de3a8 (created January 7, 2025).
// IREE codegen IR dump of an attention dispatch (see #translation below).
// ---- Attribute aliases -----------------------------------------------------
// Indexing maps for the two 3-D vector.contract ops below (dims d0, d1, d2).
#map = affine_map<(d0, d1, d2) -> (d2, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d1, d2)>
// iree_vector_ext distribution layouts: how each 64x64 vector is tiled across
// subgroups / batches / threads / elements, with the corresponding strides.
#nested = #iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>
#nested1 = #iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>
#nested2 = #iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>
// 3 push constants (the dynamic buffer offsets loaded below); binding 0 is the
// read-only input buffer, binding 1 the output buffer.
#pipeline_layout = #hal.pipeline.layout<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
// Vector-distribute GPU pipeline: 128 threads/workgroup, subgroup size 64.
#translation = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>
module {
  // Attention dispatch over a 2x10x4096x64 f16 problem. The inner scf.for is an
  // online-softmax accumulation loop (running max / running sum / accumulator
  // carried as iter_args), i.e. a flash-attention-style schedule.
  func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #translation} {
    %cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
    // 1.802980e-01 ~= log2(e) / 8 = log2(e) / sqrt(64): presumably the softmax
    // scale fused with the base-2 exponentials (math.exp2) used below — verify.
    %cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
    %cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
    // Lowest finite f32: initial value for the running row-max.
    %cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
    %cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
    %cst_4 = arith.constant 0.000000e+00 : f16
    %c64 = arith.constant 64 : index
    %c4096 = arith.constant 4096 : index
    %c0 = arith.constant 0 : index
    // Push constants: byte offsets of the three tensors within the bindings.
    %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
    %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
    %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
    %3 = arith.index_castui %0 : i32 to index
    %4 = arith.index_castui %1 : i32 to index
    %5 = arith.index_castui %2 : i32 to index
    // Enumerate the concrete offset values each push constant can take (one per
    // call site), so later analyses can prove alignment/divisibility.
    %6:3 = util.assume.int
        %3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
        %4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
        %5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
        : index, index, index
    // %7: 2x2x10x4096x64 — two 2x10x4096x64 tensors stacked on the outer dim;
    // slice 0 and slice 1 are loaded separately below (%13 and %12).
    %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
    // %8: second operand stored as [batch, head, head_dim=64, seq=4096].
    %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
    // %9: output, [batch, seq, head, head_dim].
    %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
    %10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
    // %11 is read from the write-only binding purely to act as the destination
    // (shared_outs init) of the scf.forall below.
    %11 = flow.dispatch.tensor.load %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<2x4096x10x64xf16>
    %12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
    %13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
    // Workgroup distribution: one (batch, head, 64-row query tile) per group.
    %14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
      gpu.barrier
      // Load this workgroup's 64x64 tile from slice 0 of %7 (the operand that
      // gets scaled — presumably Q) and pre-scale it.
      %15 = vector.transfer_read %13[%arg0, %arg1, %arg2, %c0], %cst_4 {__inplace_operands_attr__ = ["true", "none", "none", "none", "none", "none"], in_bounds = [true, true]} : tensor<2x10x4096x64xf16>, vector<64x64xf16>
      %16 = iree_vector_ext.to_layout %15 to layout(#nested) : vector<64x64xf16>
      %17 = tensor.empty() : tensor<64x64xf16>
      %18 = arith.mulf %16, %cst_0 : vector<64x64xf16>
      // Round-trip the scaled tile through workgroup shared memory (with a
      // barrier) to re-distribute it into the MFMA operand layout #nested1.
      %19 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
      %20 = vector.transfer_write %18, %19[%c0, %c0] {__inplace_operands_attr__ = ["none", "true", "none", "none"], in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
      %21 = iree_gpu.value_barrier %20 {__inplace_operands_attr__ = ["true"]} : tensor<64x64xf16, #gpu.address_space<workgroup>>
      %22 = vector.transfer_read %21[%c0, %c0], %cst_4 {__inplace_operands_attr__ = ["true", "none", "none", "none"], in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
      %23 = iree_vector_ext.to_layout %22 to layout(#nested1) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
      %24 = iree_vector_ext.to_layout %cst_3 to layout(#nested1) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
      // Online-softmax loop over the 4096 key positions, 64 at a time.
      // iter_args: %arg5 = running row max, %arg6 = running row sum,
      // %arg7 = unnormalized output accumulator.
      %25:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %cst_2, %arg6 = %cst_1, %arg7 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
        gpu.barrier
        // Stage this iteration's two 64x64 tiles (from %12 and %10) through
        // shared memory behind a single value barrier.
        %32 = vector.transfer_read %12[%arg0, %arg1, %arg4, %c0], %cst_4 {__inplace_operands_attr__ = ["true", "none", "none", "none", "none", "none"], in_bounds = [true, true]} : tensor<2x10x4096x64xf16>, vector<64x64xf16>
        %33 = iree_vector_ext.to_layout %32 to layout(#nested) : vector<64x64xf16>
        %34 = vector.transfer_read %10[%arg0, %arg1, %c0, %arg4], %cst_4 {__inplace_operands_attr__ = ["true", "none", "none", "none", "none", "none"], in_bounds = [true, true]} : tensor<2x10x64x4096xf16>, vector<64x64xf16>
        %35 = iree_vector_ext.to_layout %34 to layout(#nested) : vector<64x64xf16>
        %36 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
        %37 = vector.transfer_write %33, %36[%c0, %c0] {__inplace_operands_attr__ = ["none", "true", "none", "none"], in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
        %38 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
        %39 = vector.transfer_write %35, %38[%c0, %c0] {__inplace_operands_attr__ = ["none", "true", "none", "none"], in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
        %40:2 = iree_gpu.value_barrier %37, %39 {__inplace_operands_attr__ = ["true", "true"]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, tensor<64x64xf16, #gpu.address_space<workgroup>>
        %41 = vector.transfer_read %40#0[%c0, %c0], %cst_4 {__inplace_operands_attr__ = ["true", "none", "none", "none"], in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
        %42 = iree_vector_ext.to_layout %41 to layout(#nested2) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
        // S[d0, d2] = sum_d1 %42[d2, d1] * %23[d0, d1]: the scaled tile times
        // this chunk transposed, contracted over the 64-wide head dim (d1).
        %43 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %42, %23, %24 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
        %44 = iree_vector_ext.to_layout %43 to layout(#nested1) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
        // Update running max, then rescale the old sum by exp2(old - new max).
        %45 = vector.multi_reduction <maximumf>, %44, %arg5 [1] : vector<64x64xf32> to vector<64xf32>
        %46 = arith.subf %arg5, %45 : vector<64xf32>
        %47 = math.exp2 %46 : vector<64xf32>
        %48 = arith.mulf %47, %arg6 : vector<64xf32>
        // P = exp2(S - rowmax), accumulated into the running row sum.
        %49 = vector.broadcast %45 : vector<64xf32> to vector<64x64xf32>
        %50 = vector.transpose %49, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
        %51 = arith.subf %44, %50 : vector<64x64xf32>
        %52 = math.exp2 %51 : vector<64x64xf32>
        %53 = vector.multi_reduction <add>, %52, %48 [1] : vector<64x64xf32> to vector<64xf32>
        %54 = arith.truncf %52 : vector<64x64xf32> to vector<64x64xf16>
        // Rescale the previous accumulator by the same correction factor.
        %55 = vector.broadcast %47 : vector<64xf32> to vector<64x64xf32>
        %56 = vector.transpose %55, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
        %57 = arith.mulf %56, %arg7 : vector<64x64xf32>
        // acc[d0, d1] += sum_d2 %59[d1, d2] * P[d0, d2]: fold the second staged
        // tile (stored [head_dim, seq]) into the accumulator.
        %58 = vector.transfer_read %40#1[%c0, %c0], %cst_4 {__inplace_operands_attr__ = ["true", "none", "none", "none"], in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
        %59 = iree_vector_ext.to_layout %58 to layout(#nested2) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
        %60 = iree_vector_ext.to_layout %54 to layout(#nested1) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
        %61 = iree_vector_ext.to_layout %57 to layout(#nested1) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
        %62 = vector.contract {indexing_maps = [#map3, #map2, #map1], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %59, %60, %61 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
        %63 = iree_vector_ext.to_layout %62 to layout(#nested1) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
        scf.yield %45, %53, %63 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
      }
      // Normalize: out = acc * (1 / running_sum), then truncate to f16.
      %26 = vector.broadcast %25#1 : vector<64xf32> to vector<64x64xf32>
      %27 = arith.divf %cst, %26 : vector<64x64xf32>
      %28 = vector.transpose %27, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
      %29 = arith.mulf %28, %25#2 : vector<64x64xf32>
      %30 = arith.truncf %29 : vector<64x64xf32> to vector<64x64xf16>
      %31 = vector.transfer_write %30, %17[%c0, %c0] {__inplace_operands_attr__ = ["none", "true", "none", "none"], in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16>
      // Scatter the 64x64 result tile into [batch, seq-tile, head, :] of the
      // shared output.
      %extracted_slice = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] {__inplace_operands_attr__ = ["true", "none", "none", "none"]} : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
      %inserted_slice = tensor.insert_slice %31 into %extracted_slice[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] {__inplace_operands_attr__ = ["true", "true"]} : tensor<64x64xf16> into tensor<1x64x1x64xf16>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %inserted_slice into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] {__inplace_operands_attr__ = ["true", "true", "none", "none", "none"]} : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
      }
    } {__inplace_operands_attr__ = ["true"], mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
    flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] {__inplace_operands_attr__ = ["true", "none"]} : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
    return
  }
}
// (GitHub gist page footer removed so this file parses as MLIR.)