
@pashu123
Created March 28, 2025 07:12
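Two lowerings of the same IREE dispatch, main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32: first with the LLVMGPUVectorDistribute pipeline (gpu.subgroup_reduce plus shared-memory staging), then with LLVMGPUWarpReduction (explicit gpu.shuffle butterflies). The arithmetic appears to be a group-norm-style normalization: for each (batch, group) slice of 10x16384 elements, mean = sum(x0) / 163840, var = sum((x0 - mean)^2) / 163840, out = (x1 - mean) * rsqrt(var + 1e-5), where x0 is binding 0 and x1 is binding 1.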
// -----// IR Dump After CSE (cse) //----- //
func.func @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} {
  %cst = arith.constant dense<0.000000e+00> : vector<1x1x16xf32>
  %cst_0 = arith.constant dense<0.000000e+00> : vector<1xf32>
  %cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x4xf16>
  %c0 = arith.constant 0 : index
  %cst_2 = arith.constant 1.638400e+05 : f32
  %cst_3 = arith.constant 0.000000e+00 : f32
  %c1 = arith.constant 1 : index
  %c40 = arith.constant 40 : index
  %cst_4 = arith.constant 0.000000e+00 : f16
  %cst_5 = arith.constant dense<0.000000e+00> : vector<1x1x4xf32>
  %cst_6 = arith.constant dense<1.638400e+05> : vector<1x1x4xf32>
  %cst_7 = arith.constant dense<9.99999974E-6> : vector<1x1x4xf32>
  %thread_id_x = gpu.thread_id x
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x32x10x16384xf16, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %0, 64 : memref<2x32x10x16384xf16, #hal.descriptor_type<storage_buffer>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x32x10x16384xf16, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %1, 64 : memref<2x32x10x16384xf16, #hal.descriptor_type<storage_buffer>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x32x10x16384xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %2, 64 : memref<2x32x10x16384xf32, #hal.descriptor_type<storage_buffer>>
  scf.forall (%arg0, %arg1) in (2, 32) {
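    // First pass: each thread accumulates partial sums of binding 0 (vector<4xf16>, widened to f32).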
    %3 = scf.for %arg2 = %c0 to %c40 step %c1 iter_args(%arg3 = %cst_5) -> (vector<1x1x4xf32>) {
      %30:2 = affine.delinearize_index %arg2 into (10, 4) : index, index
      %31 = affine.apply affine_map<(d0) -> (d0 * 4096)>(%30#1)
      %32:3 = affine.delinearize_index %thread_id_x into (16, 64) : index, index, index
      %33:2 = affine.delinearize_index %thread_id_x into (64) : index, index
      %34 = affine.linearize_index [%32#1, %33#1, %31] by (16, 64, 4) : index
      %35 = vector.transfer_read %0[%arg0, %arg1, %30#0, %34], %cst_4 {in_bounds = [true]} : memref<2x32x10x16384xf16, #hal.descriptor_type<storage_buffer>>, vector<4xf16>
      %36 = vector.insert_strided_slice %35, %cst_1 {offsets = [0, 0, 0], strides = [1]} : vector<4xf16> into vector<1x1x4xf16>
      %37 = arith.extf %36 : vector<1x1x4xf16> to vector<1x1x4xf32>
      %38 = arith.addf %37, %arg3 : vector<1x1x4xf32>
      scf.yield %38 : vector<1x1x4xf32>
    }
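    // Reduce the partial sums: vector reduce, then a 64-lane subgroup reduce, then a
    // cross-subgroup reduce through 16 staging slots in an 18-element workgroup buffer;
    // %14 is the mean (sum / 163840).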
    %4 = vector.multi_reduction <add>, %3, %cst_3 [0, 1, 2] : vector<1x1x4xf32> to f32
    %5 = gpu.subgroup_reduce add %4 cluster(size = 64) : (f32) -> f32
    %6 = vector.insert %5, %cst_0 [0] : f32 into vector<1xf32>
    %alloc = memref.alloc() : memref<18xf32, #gpu.address_space<workgroup>>
    gpu.barrier
    %7:3 = affine.delinearize_index %thread_id_x into (16, 64) : index, index, index
    vector.transfer_write %6, %alloc[%7#1] {in_bounds = [true]} : vector<1xf32>, memref<18xf32, #gpu.address_space<workgroup>>
    gpu.barrier
    %8 = vector.transfer_read %alloc[%c0], %cst_3 {in_bounds = [true]} : memref<18xf32, #gpu.address_space<workgroup>>, vector<16xf32>
    %9 = vector.insert_strided_slice %8, %cst {offsets = [0, 0, 0], strides = [1]} : vector<16xf32> into vector<1x1x16xf32>
    %10 = vector.multi_reduction <add>, %9, %cst_3 [0, 1, 2] : vector<1x1x16xf32> to f32
    %11 = vector.broadcast %10 : f32 to vector<1xf32>
    %12 = arith.addf %11, %cst_0 : vector<1xf32>
    %13 = vector.extract %12[0] : f32 from vector<1xf32>
    %14 = arith.divf %13, %cst_2 : f32
    %15 = vector.broadcast %14 : f32 to vector<1x1x4xf32>
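    // Second pass: accumulate the sum of squared deviations from the mean.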
    %16 = scf.for %arg2 = %c0 to %c40 step %c1 iter_args(%arg3 = %cst_5) -> (vector<1x1x4xf32>) {
      %30:2 = affine.delinearize_index %arg2 into (10, 4) : index, index
      %31 = affine.apply affine_map<(d0) -> (d0 * 4096)>(%30#1)
      %32:2 = affine.delinearize_index %thread_id_x into (64) : index, index
      %33 = affine.linearize_index [%7#1, %32#1, %31] by (16, 64, 4) : index
      %34 = vector.transfer_read %0[%arg0, %arg1, %30#0, %33], %cst_4 {in_bounds = [true]} : memref<2x32x10x16384xf16, #hal.descriptor_type<storage_buffer>>, vector<4xf16>
      %35 = vector.insert_strided_slice %34, %cst_1 {offsets = [0, 0, 0], strides = [1]} : vector<4xf16> into vector<1x1x4xf16>
      %36 = arith.extf %35 : vector<1x1x4xf16> to vector<1x1x4xf32>
      %37 = arith.subf %36, %15 : vector<1x1x4xf32>
      %38 = arith.mulf %37, %37 : vector<1x1x4xf32>
      %39 = arith.addf %38, %arg3 : vector<1x1x4xf32>
      scf.yield %39 : vector<1x1x4xf32>
    }
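    // Same two-level reduction for the squared deviations; %27 is the variance and %29 = rsqrt(variance + 1e-5).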
    %17 = vector.multi_reduction <add>, %16, %cst_3 [0, 1, 2] : vector<1x1x4xf32> to f32
    %18 = gpu.subgroup_reduce add %17 cluster(size = 64) : (f32) -> f32
    %19 = vector.insert %18, %cst_0 [0] : f32 into vector<1xf32>
    %alloc_8 = memref.alloc() : memref<18xf32, #gpu.address_space<workgroup>>
    gpu.barrier
    vector.transfer_write %19, %alloc_8[%7#1] {in_bounds = [true]} : vector<1xf32>, memref<18xf32, #gpu.address_space<workgroup>>
    gpu.barrier
    %20 = vector.transfer_read %alloc_8[%c0], %cst_3 {in_bounds = [true]} : memref<18xf32, #gpu.address_space<workgroup>>, vector<16xf32>
    %21 = vector.insert_strided_slice %20, %cst {offsets = [0, 0, 0], strides = [1]} : vector<16xf32> into vector<1x1x16xf32>
    %22 = vector.multi_reduction <add>, %21, %cst_3 [0, 1, 2] : vector<1x1x16xf32> to f32
    %23 = vector.broadcast %22 : f32 to vector<1xf32>
    %24 = arith.addf %23, %cst_0 : vector<1xf32>
    %25 = vector.extract %24[0] : f32 from vector<1xf32>
    %26 = vector.broadcast %25 : f32 to vector<1x1x4xf32>
    %27 = arith.divf %26, %cst_6 : vector<1x1x4xf32>
    %28 = arith.addf %27, %cst_7 : vector<1x1x4xf32>
    %29 = math.rsqrt %28 : vector<1x1x4xf32>
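    // Final pass: normalize binding 1 with the statistics computed from binding 0 and write the f32 result.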
    scf.for %arg2 = %c0 to %c40 step %c1 {
      %30:2 = affine.delinearize_index %arg2 into (10, 4) : index, index
      %31 = affine.apply affine_map<(d0) -> (d0 * 4096)>(%30#1)
      %32:2 = affine.delinearize_index %thread_id_x into (64) : index, index
      %33 = affine.linearize_index [%7#1, %32#1, %31] by (16, 64, 4) : index
      %34 = vector.transfer_read %1[%arg0, %arg1, %30#0, %33], %cst_4 {in_bounds = [true]} : memref<2x32x10x16384xf16, #hal.descriptor_type<storage_buffer>>, vector<4xf16>
      %35 = vector.insert_strided_slice %34, %cst_1 {offsets = [0, 0, 0], strides = [1]} : vector<4xf16> into vector<1x1x4xf16>
      %36 = arith.extf %35 : vector<1x1x4xf16> to vector<1x1x4xf32>
      %37 = arith.subf %36, %15 : vector<1x1x4xf32>
      %38 = arith.mulf %37, %29 : vector<1x1x4xf32>
      %39 = affine.linearize_index disjoint [%7#1, %32#1, %c0] by (16, 64, 4) : index
      %40 = vector.extract %38[0, 0] : vector<4xf32> from vector<1x1x4xf32>
      %41 = affine.apply affine_map<(d0)[s0] -> (d0 * 4096 + s0)>(%30#1)[%39]
      vector.transfer_write %40, %2[%arg0, %arg1, %30#0, %41] {in_bounds = [true]} : vector<4xf32>, memref<2x32x10x16384xf32, #hal.descriptor_type<storage_buffer>>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  return
}
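
// The same dispatch lowered with the LLVMGPUWarpReduction pipeline: the subgroup
// reductions above become explicit xor-shuffle butterflies, and the bindings are
// wrapped in amdgpu.fat_raw_buffer_cast.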
func.func @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [1024, 1, 1] subgroup_size = 64>} {
  %c40 = arith.constant 40 : index
  %cst = arith.constant dense<9.99999974E-6> : vector<4xf32>
  %cst_0 = arith.constant dense<1.638400e+05> : vector<4xf32>
  %c0_i32 = arith.constant 0 : i32
  %c15 = arith.constant 15 : index
  %c64 = arith.constant 64 : index
  %c32_i32 = arith.constant 32 : i32
  %c16_i32 = arith.constant 16 : i32
  %c8_i32 = arith.constant 8 : i32
  %c4_i32 = arith.constant 4 : i32
  %c2_i32 = arith.constant 2 : i32
  %c64_i32 = arith.constant 64 : i32
  %c1_i32 = arith.constant 1 : i32
  %cst_1 = arith.constant dense<0.000000e+00> : vector<4xf32>
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %cst_2 = arith.constant 0.000000e+00 : f16
  %cst_3 = arith.constant 0.000000e+00 : f32
  %thread_id_x = gpu.thread_id x
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x32x10x16384xf16, #hal.descriptor_type<storage_buffer>>
  %1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<2x32x10x16384xf16, #hal.descriptor_type<storage_buffer>> to memref<2x32x10x16384xf16, #amdgpu.address_space<fat_raw_buffer>>
  memref.assume_alignment %1, 64 : memref<2x32x10x16384xf16, #amdgpu.address_space<fat_raw_buffer>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x32x10x16384xf16, #hal.descriptor_type<storage_buffer>>
  %3 = amdgpu.fat_raw_buffer_cast %2 resetOffset : memref<2x32x10x16384xf16, #hal.descriptor_type<storage_buffer>> to memref<2x32x10x16384xf16, #amdgpu.address_space<fat_raw_buffer>>
  memref.assume_alignment %3, 64 : memref<2x32x10x16384xf16, #amdgpu.address_space<fat_raw_buffer>>
  %4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x32x10x16384xf32, #hal.descriptor_type<storage_buffer>>
  %5 = amdgpu.fat_raw_buffer_cast %4 resetOffset : memref<2x32x10x16384xf32, #hal.descriptor_type<storage_buffer>> to memref<2x32x10x16384xf32, #amdgpu.address_space<fat_raw_buffer>>
  memref.assume_alignment %5, 64 : memref<2x32x10x16384xf32, #amdgpu.address_space<fat_raw_buffer>>
  %workgroup_id_y = hal.interface.workgroup.id[1] upper_bound 2 : index
  %workgroup_id_x = hal.interface.workgroup.id[0] upper_bound 32 : index
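  // First pass: per-thread partial sums of binding 0 (vector<4xf16>, widened to f32).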
  %6 = scf.for %arg0 = %c0 to %c40 step %c1 iter_args(%arg1 = %cst_1) -> (vector<4xf32>) {
    %52:2 = affine.delinearize_index %arg0 into (10, 4) : index, index
    %53 = affine.apply affine_map<(d0)[s0] -> (d0 * 4096 + s0 * 4)>(%52#1)[%thread_id_x]
    %54 = vector.transfer_read %1[%workgroup_id_y, %workgroup_id_x, %52#0, %53], %cst_2 {in_bounds = [true]} : memref<2x32x10x16384xf16, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf16>
    %55 = arith.extf %54 : vector<4xf16> to vector<4xf32>
    %56 = arith.addf %55, %arg1 : vector<4xf32>
    scf.yield %56 : vector<4xf32>
  }
  %7 = vector.reduction <add>, %6 : vector<4xf32> into f32
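  // Wave-level butterfly reduction: xor shuffles at strides 1, 2, 4, 8, 16, 32 across the 64 lanes.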
  %shuffleResult, %valid = gpu.shuffle xor %7, %c1_i32, %c64_i32 : f32
  %8 = arith.addf %7, %shuffleResult : f32
  %shuffleResult_4, %valid_5 = gpu.shuffle xor %8, %c2_i32, %c64_i32 : f32
  %9 = arith.addf %8, %shuffleResult_4 : f32
  %shuffleResult_6, %valid_7 = gpu.shuffle xor %9, %c4_i32, %c64_i32 : f32
  %10 = arith.addf %9, %shuffleResult_6 : f32
  %shuffleResult_8, %valid_9 = gpu.shuffle xor %10, %c8_i32, %c64_i32 : f32
  %11 = arith.addf %10, %shuffleResult_8 : f32
  %shuffleResult_10, %valid_11 = gpu.shuffle xor %11, %c16_i32, %c64_i32 : f32
  %12 = arith.addf %11, %shuffleResult_10 : f32
  %shuffleResult_12, %valid_13 = gpu.shuffle xor %12, %c32_i32, %c64_i32 : f32
  %13 = arith.addf %12, %shuffleResult_12 : f32
  %alloc = memref.alloc() : memref<16xf32, #gpu.address_space<workgroup>>
  %14 = arith.divui %thread_id_x, %c64 : index
  %15 = arith.remui %thread_id_x, %c64 : index
  %16 = arith.cmpi eq, %15, %c0 : index
  scf.if %16 {
    memref.store %13, %alloc[%14] : memref<16xf32, #gpu.address_space<workgroup>>
  }
  gpu.barrier
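  // Cross-wave reduction: lane 0 of each of the 16 waves staged its sum above; every lane
  // reloads a slot (lane index clamped to 15), a 4-step xor butterfly sums the 16 values,
  // and the idx-0 shuffle broadcasts the total. %25 is the mean (sum / 163840).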
  %17 = arith.minui %15, %c15 : index
  %18 = memref.load %alloc[%17] : memref<16xf32, #gpu.address_space<workgroup>>
  %shuffleResult_14, %valid_15 = gpu.shuffle xor %18, %c1_i32, %c64_i32 : f32
  %19 = arith.addf %18, %shuffleResult_14 : f32
  %shuffleResult_16, %valid_17 = gpu.shuffle xor %19, %c2_i32, %c64_i32 : f32
  %20 = arith.addf %19, %shuffleResult_16 : f32
  %shuffleResult_18, %valid_19 = gpu.shuffle xor %20, %c4_i32, %c64_i32 : f32
  %21 = arith.addf %20, %shuffleResult_18 : f32
  %shuffleResult_20, %valid_21 = gpu.shuffle xor %21, %c8_i32, %c64_i32 : f32
  %22 = arith.addf %21, %shuffleResult_20 : f32
  %shuffleResult_22, %valid_23 = gpu.shuffle idx %22, %c0_i32, %c64_i32 : f32
  %23 = arith.addf %shuffleResult_22, %cst_3 : f32
  %24 = vector.broadcast %23 : f32 to vector<4xf32>
  %25 = arith.divf %24, %cst_0 : vector<4xf32>
  %26 = scf.for %arg0 = %c0 to %c40 step %c1 iter_args(%arg1 = %cst_1) -> (vector<4xf32>) {
    %52:2 = affine.delinearize_index %arg0 into (10, 4) : index, index
    %53 = affine.apply affine_map<(d0)[s0] -> (d0 * 4096 + s0 * 4)>(%52#1)[%thread_id_x]
    %54 = vector.transfer_read %1[%workgroup_id_y, %workgroup_id_x, %52#0, %53], %cst_2 {in_bounds = [true]} : memref<2x32x10x16384xf16, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf16>
    %55 = arith.extf %54 : vector<4xf16> to vector<4xf32>
    %56 = arith.subf %55, %25 : vector<4xf32>
    %57 = arith.mulf %56, %56 : vector<4xf32>
    %58 = arith.addf %57, %arg1 : vector<4xf32>
    scf.yield %58 : vector<4xf32>
  }
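  // The same wave-level and cross-wave reduction applied to the squared deviations; %39 is their workgroup total.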
  %27 = vector.reduction <add>, %26 : vector<4xf32> into f32
  %shuffleResult_24, %valid_25 = gpu.shuffle xor %27, %c1_i32, %c64_i32 : f32
  %28 = arith.addf %27, %shuffleResult_24 : f32
  %shuffleResult_26, %valid_27 = gpu.shuffle xor %28, %c2_i32, %c64_i32 : f32
  %29 = arith.addf %28, %shuffleResult_26 : f32
  %shuffleResult_28, %valid_29 = gpu.shuffle xor %29, %c4_i32, %c64_i32 : f32
  %30 = arith.addf %29, %shuffleResult_28 : f32
  %shuffleResult_30, %valid_31 = gpu.shuffle xor %30, %c8_i32, %c64_i32 : f32
  %31 = arith.addf %30, %shuffleResult_30 : f32
  %shuffleResult_32, %valid_33 = gpu.shuffle xor %31, %c16_i32, %c64_i32 : f32
  %32 = arith.addf %31, %shuffleResult_32 : f32
  %shuffleResult_34, %valid_35 = gpu.shuffle xor %32, %c32_i32, %c64_i32 : f32
  %33 = arith.addf %32, %shuffleResult_34 : f32
  %alloc_36 = memref.alloc() : memref<16xf32, #gpu.address_space<workgroup>>
  scf.if %16 {
    memref.store %33, %alloc_36[%14] : memref<16xf32, #gpu.address_space<workgroup>>
  }
  gpu.barrier
  %34 = memref.load %alloc_36[%17] : memref<16xf32, #gpu.address_space<workgroup>>
  %shuffleResult_37, %valid_38 = gpu.shuffle xor %34, %c1_i32, %c64_i32 : f32
  %35 = arith.addf %34, %shuffleResult_37 : f32
  %shuffleResult_39, %valid_40 = gpu.shuffle xor %35, %c2_i32, %c64_i32 : f32
  %36 = arith.addf %35, %shuffleResult_39 : f32
  %shuffleResult_41, %valid_42 = gpu.shuffle xor %36, %c4_i32, %c64_i32 : f32
  %37 = arith.addf %36, %shuffleResult_41 : f32
  %shuffleResult_43, %valid_44 = gpu.shuffle xor %37, %c8_i32, %c64_i32 : f32
  %38 = arith.addf %37, %shuffleResult_43 : f32
  %shuffleResult_45, %valid_46 = gpu.shuffle idx %38, %c0_i32, %c64_i32 : f32
  %39 = arith.addf %shuffleResult_45, %cst_3 : f32
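  // The mean reduction is replayed from the staged first-pass sums (%13): %51 below is the
  // mean and %50 is rsqrt(variance + 1e-5).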
  %alloc_47 = memref.alloc() : memref<16xf32, #gpu.address_space<workgroup>>
  scf.if %16 {
    memref.store %13, %alloc_47[%14] : memref<16xf32, #gpu.address_space<workgroup>>
  }
  gpu.barrier
  %40 = memref.load %alloc_47[%17] : memref<16xf32, #gpu.address_space<workgroup>>
  %shuffleResult_48, %valid_49 = gpu.shuffle xor %40, %c1_i32, %c64_i32 : f32
  %41 = arith.addf %40, %shuffleResult_48 : f32
  %shuffleResult_50, %valid_51 = gpu.shuffle xor %41, %c2_i32, %c64_i32 : f32
  %42 = arith.addf %41, %shuffleResult_50 : f32
  %shuffleResult_52, %valid_53 = gpu.shuffle xor %42, %c4_i32, %c64_i32 : f32
  %43 = arith.addf %42, %shuffleResult_52 : f32
  %shuffleResult_54, %valid_55 = gpu.shuffle xor %43, %c8_i32, %c64_i32 : f32
  %44 = arith.addf %43, %shuffleResult_54 : f32
  %shuffleResult_56, %valid_57 = gpu.shuffle idx %44, %c0_i32, %c64_i32 : f32
  %45 = arith.addf %shuffleResult_56, %cst_3 : f32
  %46 = vector.broadcast %39 : f32 to vector<4xf32>
  %47 = vector.broadcast %45 : f32 to vector<4xf32>
  %48 = arith.divf %46, %cst_0 : vector<4xf32>
  %49 = arith.addf %48, %cst : vector<4xf32>
  %50 = math.rsqrt %49 : vector<4xf32>
  %51 = arith.divf %47, %cst_0 : vector<4xf32>
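  // Final pass: out = (x1 - mean) * rsqrt(var + eps) on binding 1, written as vector<4xf32>.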
  scf.for %arg0 = %c0 to %c40 step %c1 {
    %52:2 = affine.delinearize_index %arg0 into (10, 4) : index, index
    %53 = affine.apply affine_map<(d0)[s0] -> (d0 * 4096 + s0 * 4)>(%52#1)[%thread_id_x]
    %54 = vector.transfer_read %3[%workgroup_id_y, %workgroup_id_x, %52#0, %53], %cst_2 {in_bounds = [true]} : memref<2x32x10x16384xf16, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf16>
    %55 = arith.extf %54 : vector<4xf16> to vector<4xf32>
    %56 = arith.subf %55, %51 : vector<4xf32>
    %57 = arith.mulf %56, %50 : vector<4xf32>
    vector.transfer_write %57, %5[%workgroup_id_y, %workgroup_id_x, %52#0, %53] {in_bounds = [true]} : vector<4xf32>, memref<2x32x10x16384xf32, #amdgpu.address_space<fat_raw_buffer>>
  }
  return
}