Created
June 4, 2025 18:08
-
-
Save pashu123/36e462b99fa04749e2c653cab18c24ed to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Softmax over the dynamic (inner) dimension of a 32x? f16 buffer, lowered for
// the IREE LLVMGPUVectorDistribute pipeline (workgroup_size = [4, 1, 1],
// subgroup_size = 32; see #translation). One scf.forall iteration per row;
// within each row, three scf.for passes tile the dynamic dimension in chunks
// of 32:
//   pass 1: running elementwise max            -> %alloc_4, reduced into %alloc_3
//   pass 2: running sum of exp(x - row_max)    -> %alloc_5, reduced into %alloc
//   pass 3: exp(x - row_max) / row_sum         -> output binding (binding 1)

// Tile size for the dynamic dim: min(s0 - d0, 32) (partial last tile).
#map = affine_map<(d0)[s0] -> (-d0 + s0, 32)>
// Global column index: tile base offset (s0) + thread's lane base (s1).
#map1 = affine_map<()[s0, s1] -> (s0 + s1)>
// 1-D identity map for elementwise operands of the linalg.generics below.
#map2 = affine_map<(d0) -> (d0)>
// Scalar-broadcast map for the 0-D memrefs holding the row max / row sum.
#map3 = affine_map<(d0) -> ()>
// Two push constants (combined below into the dynamic dim size) and two
// storage buffers: binding 0 = read-only input, binding 1 = output.
#pipeline_layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>
#translation = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [4, 1, 1] subgroup_size = 32, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>
module {
func.func @dynamic_softmax() attributes {translation_info = #translation} {
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
// Zero vector: initializer for the 32-wide exp-sum accumulator (%alloc_5).
%cst = arith.constant dense<0.000000e+00> : vector<32xf16>
// 0xFE00 is an f16 NaN bit pattern; arith.maxnumf returns the non-NaN
// operand, so NaN acts as the identity when initializing the max
// accumulator (%alloc_4).
%cst_0 = arith.constant dense<0xFE00> : vector<32xf16>
%c32 = arith.constant 32 : index
%cst_1 = arith.constant 0.000000e+00 : f16
// Scalar NaN: identity element for the final maxnumf reduction (%11).
%cst_2 = arith.constant 0xFE00 : f16
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
// Workgroup-memory scratch:
//   %alloc   : 0-D scalar, row sum of exp(x - max)
//   %alloc_3 : 0-D scalar, row max
//   %alloc_4 : 1x32 running-max accumulator (one slot per tile lane)
//   %alloc_5 : 1x32 running exp-sum accumulator
%alloc = memref.alloc() : memref<f16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<f16, #gpu.address_space<workgroup>>
%alloc_4 = memref.alloc() : memref<1x32xf16, #gpu.address_space<workgroup>>
%alloc_5 = memref.alloc() : memref<1x32xf16, #gpu.address_space<workgroup>>
// Decode the dynamic inner-dim size: two i32 push constants are zero-extended
// and recombined into one i64 (lo | (hi << 32)), then cast to index (%7).
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
%2 = arith.extui %0 : i32 to i64
%3 = arith.extui %1 : i32 to i64
%4 = arith.shli %3, %c32_i64 : i64
%5 = arith.ori %2, %4 : i64
%6 = arith.index_castui %5 : i64 to index
%7 = iree_tensor_ext.dispatch.workload.ordinal %6, 0 : index
// Input (read-only) and output buffers, both 32 rows by %7 dynamic columns.
%8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<32x?xf16, #hal.descriptor_type<storage_buffer>>{%7}
%assume_align = memref.assume_alignment %8, 64 : memref<32x?xf16, #hal.descriptor_type<storage_buffer>>
%9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : memref<32x?xf16, #hal.descriptor_type<storage_buffer>>{%7}
%assume_align_6 = memref.assume_alignment %9, 64 : memref<32x?xf16, #hal.descriptor_type<storage_buffer>>
// One forall iteration per row (%arg0 in 0..31), mapped to workgroup x.
scf.forall (%arg0) in (32) {
// Pass 1: initialize the 32-lane max accumulator to NaN (maxnumf identity).
vector.transfer_write %cst_0, %alloc_4[%c0, %c0] {in_bounds = [true]} : vector<32xf16>, memref<1x32xf16, #gpu.address_space<workgroup>>
// Walk the dynamic dim in tiles of 32 columns, accumulating per-lane maxima.
scf.for %arg1 = %c0 to %7 step %c32 {
// %16 = size of this tile = min(%7 - %arg1, 32).
%16 = affine.min #map(%arg1)[%7]
%subview = memref.subview %alloc_4[0, 0] [1, %16] [1, 1] : memref<1x32xf16, #gpu.address_space<workgroup>> to memref<?xf16, strided<[1]>, #gpu.address_space<workgroup>>
// Per-thread mask computation (4 threads x 8 lanes = 32 tile elements):
//   %17#1 : this thread's chunk id in 0..3
//   %19   : chunk id / in-chunk offset of the LAST valid element (%16 - 1)
//   %24   : valid lane count for this thread -- 8 if its chunk is fully
//           below the boundary chunk, last-offset+1 if it IS the boundary
//           chunk, 0 if beyond it
%17:2 = affine.delinearize_index %thread_id_x into (4) : index, index
%18 = arith.subi %16, %c1 : index
%19:2 = affine.delinearize_index %18 into (4, 8) : index, index
%20 = arith.addi %19#1, %c1 : index
%21 = arith.cmpi eq, %17#1, %19#0 : index
%22 = arith.cmpi slt, %17#1, %19#0 : index
%23 = arith.select %22, %c8, %c0 : index
%24 = arith.select %21, %20, %23 : index
%25 = vector.create_mask %24 : vector<8xi1>
// %26 = this thread's base lane within the tile (chunk * 8);
// %27 = global column index (%arg1 + %26).
%26 = affine.linearize_index disjoint [%17#1, %c0] by (4, 8) : index
%27 = affine.apply #map1()[%arg1, %26]
// Masked 8-lane read of the input row at this tile position.
%28 = vector.transfer_read %assume_align[%arg0, %27], %cst_1, %25 {in_bounds = [true]} : memref<32x?xf16, #hal.descriptor_type<storage_buffer>>, vector<8xf16>
// NOTE(review): workgroup-space memref.allocs inside the scf.for body
// (here and below) -- presumably hoisted/legalized by a later pass in the
// pipeline; confirm.
%alloc_7 = memref.alloc(%16) : memref<?xf16, #gpu.address_space<workgroup>>
vector.transfer_write %28, %alloc_7[%26], %25 {in_bounds = [true]} : vector<8xf16>, memref<?xf16, #gpu.address_space<workgroup>>
// Read the current running max for these lanes.
%29 = vector.transfer_read %alloc_4[%c0, %26], %cst_1, %25 {in_bounds = [true]} : memref<1x32xf16, #gpu.address_space<workgroup>>, vector<8xf16>
%alloc_8 = memref.alloc(%16) : memref<?xf16, #gpu.address_space<workgroup>>
vector.transfer_write %29, %alloc_8[%26], %25 {in_bounds = [true]} : vector<8xf16>, memref<?xf16, #gpu.address_space<workgroup>>
%alloc_9 = memref.alloc(%16) : memref<?xf16, #gpu.address_space<workgroup>>
%alloc_10 = memref.alloc(%16) : memref<?xf16, #gpu.address_space<workgroup>>
// new_max = maxnumf(input, running_max), elementwise over the tile.
linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%alloc_7, %alloc_8 : memref<?xf16, #gpu.address_space<workgroup>>, memref<?xf16, #gpu.address_space<workgroup>>) outs(%alloc_9 : memref<?xf16, #gpu.address_space<workgroup>>) {
^bb0(%in: f16, %in_11: f16, %out: f16):
%31 = arith.maxnumf %in, %in_11 : f16
linalg.yield %31 : f16
}
%30 = vector.transfer_read %alloc_9[%26], %cst_1, %25 {in_bounds = [true]} : memref<?xf16, #gpu.address_space<workgroup>>, vector<8xf16>
vector.transfer_write %30, %alloc_10[%26], %25 {in_bounds = [true]} : vector<8xf16>, memref<?xf16, #gpu.address_space<workgroup>>
// Barrier-fenced copy of the updated maxima back into the accumulator.
gpu.barrier
memref.copy %alloc_10, %subview {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<?xf16, #gpu.address_space<workgroup>> to memref<?xf16, strided<[1]>, #gpu.address_space<workgroup>>
gpu.barrier
}
// Reduce the 32 per-lane maxima to the scalar row max (%11), seeded with NaN.
%10 = vector.transfer_read %alloc_4[%c0, %c0], %cst_1 {in_bounds = [true]} : memref<1x32xf16, #gpu.address_space<workgroup>>, vector<32xf16>
%11 = vector.multi_reduction <maxnumf>, %10, %cst_2 [0] : vector<32xf16> to f16
// Pass 2 setup: zero the exp-sum accumulator; stash row max in %alloc_3.
vector.transfer_write %cst, %alloc_5[%c0, %c0] {in_bounds = [true]} : vector<32xf16>, memref<1x32xf16, #gpu.address_space<workgroup>>
%12 = vector.broadcast %11 : f16 to vector<f16>
vector.transfer_write %12, %alloc_3[] : vector<f16>, memref<f16, #gpu.address_space<workgroup>>
// Pass 2: accumulate sum += exp(x - row_max) over the same tiling/masking.
scf.for %arg1 = %c0 to %7 step %c32 {
%16 = affine.min #map(%arg1)[%7]
%subview = memref.subview %alloc_5[0, 0] [1, %16] [1, 1] : memref<1x32xf16, #gpu.address_space<workgroup>> to memref<?xf16, strided<[1]>, #gpu.address_space<workgroup>>
// Same mask derivation as pass 1.
%17:2 = affine.delinearize_index %thread_id_x into (4) : index, index
%18 = arith.subi %16, %c1 : index
%19:2 = affine.delinearize_index %18 into (4, 8) : index, index
%20 = arith.addi %19#1, %c1 : index
%21 = arith.cmpi eq, %17#1, %19#0 : index
%22 = arith.cmpi slt, %17#1, %19#0 : index
%23 = arith.select %22, %c8, %c0 : index
%24 = arith.select %21, %20, %23 : index
%25 = vector.create_mask %24 : vector<8xi1>
%26 = affine.linearize_index disjoint [%17#1, %c0] by (4, 8) : index
%27 = affine.apply #map1()[%arg1, %26]
%28 = vector.transfer_read %assume_align[%arg0, %27], %cst_1, %25 {in_bounds = [true]} : memref<32x?xf16, #hal.descriptor_type<storage_buffer>>, vector<8xf16>
%alloc_7 = memref.alloc(%16) : memref<?xf16, #gpu.address_space<workgroup>>
vector.transfer_write %28, %alloc_7[%26], %25 {in_bounds = [true]} : vector<8xf16>, memref<?xf16, #gpu.address_space<workgroup>>
// Current running partial sums for these lanes.
%29 = vector.transfer_read %alloc_5[%c0, %26], %cst_1, %25 {in_bounds = [true]} : memref<1x32xf16, #gpu.address_space<workgroup>>, vector<8xf16>
%alloc_8 = memref.alloc(%16) : memref<?xf16, #gpu.address_space<workgroup>>
vector.transfer_write %29, %alloc_8[%26], %25 {in_bounds = [true]} : vector<8xf16>, memref<?xf16, #gpu.address_space<workgroup>>
%alloc_9 = memref.alloc(%16) : memref<?xf16, #gpu.address_space<workgroup>>
%alloc_10 = memref.alloc(%16) : memref<?xf16, #gpu.address_space<workgroup>>
// new_sum = exp(input - row_max) + running_sum (row max broadcast via #map3).
linalg.generic {indexing_maps = [#map2, #map3, #map2, #map2], iterator_types = ["parallel"]} ins(%alloc_7, %alloc_3, %alloc_8 : memref<?xf16, #gpu.address_space<workgroup>>, memref<f16, #gpu.address_space<workgroup>>, memref<?xf16, #gpu.address_space<workgroup>>) outs(%alloc_9 : memref<?xf16, #gpu.address_space<workgroup>>) {
^bb0(%in: f16, %in_11: f16, %in_12: f16, %out: f16):
%31 = arith.subf %in, %in_11 : f16
%32 = math.exp %31 : f16
%33 = arith.addf %32, %in_12 : f16
linalg.yield %33 : f16
}
%30 = vector.transfer_read %alloc_9[%26], %cst_1, %25 {in_bounds = [true]} : memref<?xf16, #gpu.address_space<workgroup>>, vector<8xf16>
vector.transfer_write %30, %alloc_10[%26], %25 {in_bounds = [true]} : vector<8xf16>, memref<?xf16, #gpu.address_space<workgroup>>
gpu.barrier
memref.copy %alloc_10, %subview {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<?xf16, #gpu.address_space<workgroup>> to memref<?xf16, strided<[1]>, #gpu.address_space<workgroup>>
gpu.barrier
}
// Reduce the 32 partial sums (add, identity 0.0) to the scalar row sum,
// stored in %alloc.
%13 = vector.transfer_read %alloc_5[%c0, %c0], %cst_1 {in_bounds = [true]} : memref<1x32xf16, #gpu.address_space<workgroup>>, vector<32xf16>
%14 = vector.multi_reduction <add>, %13, %cst_1 [0] : vector<32xf16> to f16
%15 = vector.broadcast %14 : f16 to vector<f16>
vector.transfer_write %15, %alloc[] : vector<f16>, memref<f16, #gpu.address_space<workgroup>>
// Pass 3: write exp(x - row_max) / row_sum to the output row, same tiling.
scf.for %arg1 = %c0 to %7 step %c32 {
%16 = affine.min #map(%arg1)[%7]
// Destination: this tile's slice of the output row.
%subview = memref.subview %assume_align_6[%arg0, %arg1] [1, %16] [1, 1] : memref<32x?xf16, #hal.descriptor_type<storage_buffer>> to memref<?xf16, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// Same mask derivation as passes 1 and 2.
%17:2 = affine.delinearize_index %thread_id_x into (4) : index, index
%18 = arith.subi %16, %c1 : index
%19:2 = affine.delinearize_index %18 into (4, 8) : index, index
%20 = arith.addi %19#1, %c1 : index
%21 = arith.cmpi eq, %17#1, %19#0 : index
%22 = arith.cmpi slt, %17#1, %19#0 : index
%23 = arith.select %22, %c8, %c0 : index
%24 = arith.select %21, %20, %23 : index
%25 = vector.create_mask %24 : vector<8xi1>
%26 = affine.linearize_index disjoint [%17#1, %c0] by (4, 8) : index
%27 = affine.apply #map1()[%arg1, %26]
%28 = vector.transfer_read %assume_align[%arg0, %27], %cst_1, %25 {in_bounds = [true]} : memref<32x?xf16, #hal.descriptor_type<storage_buffer>>, vector<8xf16>
%alloc_7 = memref.alloc(%16) : memref<?xf16, #gpu.address_space<workgroup>>
vector.transfer_write %28, %alloc_7[%26], %25 {in_bounds = [true]} : vector<8xf16>, memref<?xf16, #gpu.address_space<workgroup>>
// Read of the output buffer at the same position; value feeds only the
// transfer_write into %alloc_8, which the generic below then overwrites
// as its out operand.
%29 = vector.transfer_read %assume_align_6[%arg0, %27], %cst_1, %25 {in_bounds = [true]} : memref<32x?xf16, #hal.descriptor_type<storage_buffer>>, vector<8xf16>
%alloc_8 = memref.alloc(%16) : memref<?xf16, #gpu.address_space<workgroup>>
%alloc_9 = memref.alloc(%16) : memref<?xf16, #gpu.address_space<workgroup>>
vector.transfer_write %29, %alloc_8[%26], %25 {in_bounds = [true]} : vector<8xf16>, memref<?xf16, #gpu.address_space<workgroup>>
// out = exp(input - row_max) / row_sum (both scalars broadcast via #map3).
linalg.generic {indexing_maps = [#map2, #map3, #map3, #map2], iterator_types = ["parallel"]} ins(%alloc_7, %alloc_3, %alloc : memref<?xf16, #gpu.address_space<workgroup>>, memref<f16, #gpu.address_space<workgroup>>, memref<f16, #gpu.address_space<workgroup>>) outs(%alloc_8 : memref<?xf16, #gpu.address_space<workgroup>>) {
^bb0(%in: f16, %in_10: f16, %in_11: f16, %out: f16):
%31 = arith.subf %in, %in_10 : f16
%32 = math.exp %31 : f16
%33 = arith.divf %32, %in_11 : f16
linalg.yield %33 : f16
}
%30 = vector.transfer_read %alloc_8[%26], %cst_1, %25 {in_bounds = [true]} : memref<?xf16, #gpu.address_space<workgroup>>, vector<8xf16>
vector.transfer_write %30, %alloc_9[%26], %25 {in_bounds = [true]} : vector<8xf16>, memref<?xf16, #gpu.address_space<workgroup>>
gpu.barrier
// Final store of the normalized tile into the output buffer.
memref.copy %alloc_9, %subview {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<?xf16, #gpu.address_space<workgroup>> to memref<?xf16, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
// Release workgroup scratch in reverse allocation order.
memref.dealloc %alloc_5 : memref<1x32xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc_4 : memref<1x32xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc_3 : memref<f16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<f16, #gpu.address_space<workgroup>>
return
}
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment