Created
March 28, 2025 07:12
-
-
Save pashu123/fbb89862642c399381fdc09cee68bfd6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -----// IR Dump After CSE (cse) //----- // | |
// Normalization dispatch lowered via the LLVMGPUVectorDistribute pipeline:
// a 1024-thread workgroup (16 subgroups of 64 lanes) handles one (%arg0, %arg1)
// slice of the 2x32x10x16384 tensor, reducing over the trailing
// 10x16384 = 163840 elements. Three passes: (1) sum -> mean, (2) squared
// deviations -> variance, (3) normalize and write out.
func.func @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<1x1x16xf32> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<1xf32> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x4xf16> | |
%c0 = arith.constant 0 : index | |
// 1.638400e+05 = 10 * 16384, the size of the per-slice reduction domain.
%cst_2 = arith.constant 1.638400e+05 : f32 | |
%cst_3 = arith.constant 0.000000e+00 : f32 | |
%c1 = arith.constant 1 : index | |
// 40 iterations x (1024 threads * vector<4>) = 163840 elements per pass.
%c40 = arith.constant 40 : index | |
%cst_4 = arith.constant 0.000000e+00 : f16 | |
%cst_5 = arith.constant dense<0.000000e+00> : vector<1x1x4xf32> | |
%cst_6 = arith.constant dense<1.638400e+05> : vector<1x1x4xf32> | |
// Epsilon added to the variance before rsqrt (~1e-5).
%cst_7 = arith.constant dense<9.99999974E-6> : vector<1x1x4xf32> | |
%thread_id_x = gpu.thread_id x | |
// Bindings 0 and 1 are read-only f16 inputs; binding 2 is the f32 output.
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x32x10x16384xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x32x10x16384xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x32x10x16384xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x32x10x16384xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x32x10x16384xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<2x32x10x16384xf32, #hal.descriptor_type<storage_buffer>> | |
// One workgroup per (batch=2, channel=32) slice.
scf.forall (%arg0, %arg1) in (2, 32) { | |
// Pass 1: each thread accumulates a vector<4> partial sum of input %0
// over 40 tiles covering the 163840-element reduction domain.
%3 = scf.for %arg2 = %c0 to %c40 step %c1 iter_args(%arg3 = %cst_5) -> (vector<1x1x4xf32>) { | |
%30:2 = affine.delinearize_index %arg2 into (10, 4) : index, index | |
%31 = affine.apply affine_map<(d0) -> (d0 * 4096)>(%30#1) | |
%32:3 = affine.delinearize_index %thread_id_x into (16, 64) : index, index, index | |
%33:2 = affine.delinearize_index %thread_id_x into (64) : index, index | |
%34 = affine.linearize_index [%32#1, %33#1, %31] by (16, 64, 4) : index | |
%35 = vector.transfer_read %0[%arg0, %arg1, %30#0, %34], %cst_4 {in_bounds = [true]} : memref<2x32x10x16384xf16, #hal.descriptor_type<storage_buffer>>, vector<4xf16> | |
%36 = vector.insert_strided_slice %35, %cst_1 {offsets = [0, 0, 0], strides = [1]} : vector<4xf16> into vector<1x1x4xf16> | |
%37 = arith.extf %36 : vector<1x1x4xf16> to vector<1x1x4xf32> | |
%38 = arith.addf %37, %arg3 : vector<1x1x4xf32> | |
scf.yield %38 : vector<1x1x4xf32> | |
} | |
// Collapse the per-thread vector, then reduce across the 64-lane subgroup.
%4 = vector.multi_reduction <add>, %3, %cst_3 [0, 1, 2] : vector<1x1x4xf32> to f32 | |
%5 = gpu.subgroup_reduce add %4 cluster(size = 64) : (f32) -> f32 | |
%6 = vector.insert %5, %cst_0 [0] : f32 into vector<1xf32> | |
// Cross-subgroup reduction: each of the 16 subgroups stages its partial in
// workgroup memory. NOTE(review): only 16 slots are written but the buffer
// is 18 wide — presumably padding to avoid bank conflicts; confirm.
%alloc = memref.alloc() : memref<18xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%7:3 = affine.delinearize_index %thread_id_x into (16, 64) : index, index, index | |
vector.transfer_write %6, %alloc[%7#1] {in_bounds = [true]} : vector<1xf32>, memref<18xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
// Every thread re-reads all 16 partials and reduces them; mean = total / 163840.
%8 = vector.transfer_read %alloc[%c0], %cst_3 {in_bounds = [true]} : memref<18xf32, #gpu.address_space<workgroup>>, vector<16xf32> | |
%9 = vector.insert_strided_slice %8, %cst {offsets = [0, 0, 0], strides = [1]} : vector<16xf32> into vector<1x1x16xf32> | |
%10 = vector.multi_reduction <add>, %9, %cst_3 [0, 1, 2] : vector<1x1x16xf32> to f32 | |
%11 = vector.broadcast %10 : f32 to vector<1xf32> | |
%12 = arith.addf %11, %cst_0 : vector<1xf32> | |
%13 = vector.extract %12[0] : f32 from vector<1xf32> | |
%14 = arith.divf %13, %cst_2 : f32 | |
%15 = vector.broadcast %14 : f32 to vector<1x1x4xf32> | |
// Pass 2: accumulate sum((x - mean)^2) over the same tiling of input %0.
%16 = scf.for %arg2 = %c0 to %c40 step %c1 iter_args(%arg3 = %cst_5) -> (vector<1x1x4xf32>) { | |
%30:2 = affine.delinearize_index %arg2 into (10, 4) : index, index | |
%31 = affine.apply affine_map<(d0) -> (d0 * 4096)>(%30#1) | |
%32:2 = affine.delinearize_index %thread_id_x into (64) : index, index | |
%33 = affine.linearize_index [%7#1, %32#1, %31] by (16, 64, 4) : index | |
%34 = vector.transfer_read %0[%arg0, %arg1, %30#0, %33], %cst_4 {in_bounds = [true]} : memref<2x32x10x16384xf16, #hal.descriptor_type<storage_buffer>>, vector<4xf16> | |
%35 = vector.insert_strided_slice %34, %cst_1 {offsets = [0, 0, 0], strides = [1]} : vector<4xf16> into vector<1x1x4xf16> | |
%36 = arith.extf %35 : vector<1x1x4xf16> to vector<1x1x4xf32> | |
%37 = arith.subf %36, %15 : vector<1x1x4xf32> | |
%38 = arith.mulf %37, %37 : vector<1x1x4xf32> | |
%39 = arith.addf %38, %arg3 : vector<1x1x4xf32> | |
scf.yield %39 : vector<1x1x4xf32> | |
} | |
// Same two-level (subgroup, then workgroup-memory) reduction for the
// squared deviations.
%17 = vector.multi_reduction <add>, %16, %cst_3 [0, 1, 2] : vector<1x1x4xf32> to f32 | |
%18 = gpu.subgroup_reduce add %17 cluster(size = 64) : (f32) -> f32 | |
%19 = vector.insert %18, %cst_0 [0] : f32 into vector<1xf32> | |
%alloc_8 = memref.alloc() : memref<18xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
vector.transfer_write %19, %alloc_8[%7#1] {in_bounds = [true]} : vector<1xf32>, memref<18xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%20 = vector.transfer_read %alloc_8[%c0], %cst_3 {in_bounds = [true]} : memref<18xf32, #gpu.address_space<workgroup>>, vector<16xf32> | |
%21 = vector.insert_strided_slice %20, %cst {offsets = [0, 0, 0], strides = [1]} : vector<16xf32> into vector<1x1x16xf32> | |
%22 = vector.multi_reduction <add>, %21, %cst_3 [0, 1, 2] : vector<1x1x16xf32> to f32 | |
%23 = vector.broadcast %22 : f32 to vector<1xf32> | |
%24 = arith.addf %23, %cst_0 : vector<1xf32> | |
%25 = vector.extract %24[0] : f32 from vector<1xf32> | |
%26 = vector.broadcast %25 : f32 to vector<1x1x4xf32> | |
// variance = total / 163840; inv_stddev = rsqrt(variance + 1e-5).
%27 = arith.divf %26, %cst_6 : vector<1x1x4xf32> | |
%28 = arith.addf %27, %cst_7 : vector<1x1x4xf32> | |
%29 = math.rsqrt %28 : vector<1x1x4xf32> | |
// Pass 3: normalize and write the f32 result. NOTE(review): the statistics
// were computed from binding 0 (%0) but the normalized values are read from
// binding 1 (%1) — presumably a fused op normalizing one tensor with the
// other's statistics; verify against the dispatch source.
scf.for %arg2 = %c0 to %c40 step %c1 { | |
%30:2 = affine.delinearize_index %arg2 into (10, 4) : index, index | |
%31 = affine.apply affine_map<(d0) -> (d0 * 4096)>(%30#1) | |
%32:2 = affine.delinearize_index %thread_id_x into (64) : index, index | |
%33 = affine.linearize_index [%7#1, %32#1, %31] by (16, 64, 4) : index | |
%34 = vector.transfer_read %1[%arg0, %arg1, %30#0, %33], %cst_4 {in_bounds = [true]} : memref<2x32x10x16384xf16, #hal.descriptor_type<storage_buffer>>, vector<4xf16> | |
%35 = vector.insert_strided_slice %34, %cst_1 {offsets = [0, 0, 0], strides = [1]} : vector<4xf16> into vector<1x1x4xf16> | |
%36 = arith.extf %35 : vector<1x1x4xf16> to vector<1x1x4xf32> | |
%37 = arith.subf %36, %15 : vector<1x1x4xf32> | |
%38 = arith.mulf %37, %29 : vector<1x1x4xf32> | |
%39 = affine.linearize_index disjoint [%7#1, %32#1, %c0] by (16, 64, 4) : index | |
%40 = vector.extract %38[0, 0] : vector<4xf32> from vector<1x1x4xf32> | |
%41 = affine.apply affine_map<(d0)[s0] -> (d0 * 4096 + s0)>(%30#1)[%39] | |
vector.transfer_write %40, %2[%arg0, %arg1, %30#0, %41] {in_bounds = [true]} : vector<4xf32>, memref<2x32x10x16384xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
return | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// The same normalization dispatch lowered via the LLVMGPUWarpReduction
// pipeline: subgroup reductions are expanded into explicit gpu.shuffle xor
// butterflies over 64-lane waves, the workgroup (y, x) ids select the
// (batch, channel) slice, and all bindings go through AMD fat raw buffer
// casts.
func.func @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [1024, 1, 1] subgroup_size = 64>} { | |
%c40 = arith.constant 40 : index | |
// Epsilon (~1e-5) and the reduction-domain size 10 * 16384 = 163840.
%cst = arith.constant dense<9.99999974E-6> : vector<4xf32> | |
%cst_0 = arith.constant dense<1.638400e+05> : vector<4xf32> | |
%c0_i32 = arith.constant 0 : i32 | |
%c15 = arith.constant 15 : index | |
%c64 = arith.constant 64 : index | |
// Shuffle offsets 1,2,4,8,16,32 for the 64-lane butterfly reductions below.
%c32_i32 = arith.constant 32 : i32 | |
%c16_i32 = arith.constant 16 : i32 | |
%c8_i32 = arith.constant 8 : i32 | |
%c4_i32 = arith.constant 4 : i32 | |
%c2_i32 = arith.constant 2 : i32 | |
%c64_i32 = arith.constant 64 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<4xf32> | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%cst_2 = arith.constant 0.000000e+00 : f16 | |
%cst_3 = arith.constant 0.000000e+00 : f32 | |
%thread_id_x = gpu.thread_id x | |
// Bindings 0 and 1 are read-only f16 inputs; binding 2 is the f32 output.
// Each is rebased through a fat raw buffer cast before use.
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x32x10x16384xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<2x32x10x16384xf16, #hal.descriptor_type<storage_buffer>> to memref<2x32x10x16384xf16, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x32x10x16384xf16, #amdgpu.address_space<fat_raw_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x32x10x16384xf16, #hal.descriptor_type<storage_buffer>> | |
%3 = amdgpu.fat_raw_buffer_cast %2 resetOffset : memref<2x32x10x16384xf16, #hal.descriptor_type<storage_buffer>> to memref<2x32x10x16384xf16, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %3, 64 : memref<2x32x10x16384xf16, #amdgpu.address_space<fat_raw_buffer>> | |
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x32x10x16384xf32, #hal.descriptor_type<storage_buffer>> | |
%5 = amdgpu.fat_raw_buffer_cast %4 resetOffset : memref<2x32x10x16384xf32, #hal.descriptor_type<storage_buffer>> to memref<2x32x10x16384xf32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %5, 64 : memref<2x32x10x16384xf32, #amdgpu.address_space<fat_raw_buffer>> | |
%workgroup_id_y = hal.interface.workgroup.id[1] upper_bound 2 : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] upper_bound 32 : index | |
// Pass 1: each thread accumulates a vector<4> partial sum of input %1 over
// 40 tiles (1024 threads * 4 elements * 40 = 163840).
%6 = scf.for %arg0 = %c0 to %c40 step %c1 iter_args(%arg1 = %cst_1) -> (vector<4xf32>) { | |
%52:2 = affine.delinearize_index %arg0 into (10, 4) : index, index | |
%53 = affine.apply affine_map<(d0)[s0] -> (d0 * 4096 + s0 * 4)>(%52#1)[%thread_id_x] | |
%54 = vector.transfer_read %1[%workgroup_id_y, %workgroup_id_x, %52#0, %53], %cst_2 {in_bounds = [true]} : memref<2x32x10x16384xf16, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf16> | |
%55 = arith.extf %54 : vector<4xf16> to vector<4xf32> | |
%56 = arith.addf %55, %arg1 : vector<4xf32> | |
scf.yield %56 : vector<4xf32> | |
} | |
// Wave-level butterfly reduction over 64 lanes (xor 1,2,4,8,16,32).
%7 = vector.reduction <add>, %6 : vector<4xf32> into f32 | |
%shuffleResult, %valid = gpu.shuffle xor %7, %c1_i32, %c64_i32 : f32 | |
%8 = arith.addf %7, %shuffleResult : f32 | |
%shuffleResult_4, %valid_5 = gpu.shuffle xor %8, %c2_i32, %c64_i32 : f32 | |
%9 = arith.addf %8, %shuffleResult_4 : f32 | |
%shuffleResult_6, %valid_7 = gpu.shuffle xor %9, %c4_i32, %c64_i32 : f32 | |
%10 = arith.addf %9, %shuffleResult_6 : f32 | |
%shuffleResult_8, %valid_9 = gpu.shuffle xor %10, %c8_i32, %c64_i32 : f32 | |
%11 = arith.addf %10, %shuffleResult_8 : f32 | |
%shuffleResult_10, %valid_11 = gpu.shuffle xor %11, %c16_i32, %c64_i32 : f32 | |
%12 = arith.addf %11, %shuffleResult_10 : f32 | |
%shuffleResult_12, %valid_13 = gpu.shuffle xor %12, %c32_i32, %c64_i32 : f32 | |
%13 = arith.addf %12, %shuffleResult_12 : f32 | |
// Cross-wave reduction: lane 0 of each of the 16 waves stores its partial;
// every lane reloads one slot (index clamped to 15), a 16-wide butterfly
// (xor 1..8) reduces them, and a broadcast from lane 0 yields the total.
%alloc = memref.alloc() : memref<16xf32, #gpu.address_space<workgroup>> | |
%14 = arith.divui %thread_id_x, %c64 : index | |
%15 = arith.remui %thread_id_x, %c64 : index | |
%16 = arith.cmpi eq, %15, %c0 : index | |
scf.if %16 { | |
memref.store %13, %alloc[%14] : memref<16xf32, #gpu.address_space<workgroup>> | |
} | |
gpu.barrier | |
%17 = arith.minui %15, %c15 : index | |
%18 = memref.load %alloc[%17] : memref<16xf32, #gpu.address_space<workgroup>> | |
%shuffleResult_14, %valid_15 = gpu.shuffle xor %18, %c1_i32, %c64_i32 : f32 | |
%19 = arith.addf %18, %shuffleResult_14 : f32 | |
%shuffleResult_16, %valid_17 = gpu.shuffle xor %19, %c2_i32, %c64_i32 : f32 | |
%20 = arith.addf %19, %shuffleResult_16 : f32 | |
%shuffleResult_18, %valid_19 = gpu.shuffle xor %20, %c4_i32, %c64_i32 : f32 | |
%21 = arith.addf %20, %shuffleResult_18 : f32 | |
%shuffleResult_20, %valid_21 = gpu.shuffle xor %21, %c8_i32, %c64_i32 : f32 | |
%22 = arith.addf %21, %shuffleResult_20 : f32 | |
%shuffleResult_22, %valid_23 = gpu.shuffle idx %22, %c0_i32, %c64_i32 : f32 | |
%23 = arith.addf %shuffleResult_22, %cst_3 : f32 | |
// mean = total / 163840, broadcast for the variance pass.
%24 = vector.broadcast %23 : f32 to vector<4xf32> | |
%25 = arith.divf %24, %cst_0 : vector<4xf32> | |
// Pass 2: accumulate sum((x - mean)^2) over the same tiling of input %1.
%26 = scf.for %arg0 = %c0 to %c40 step %c1 iter_args(%arg1 = %cst_1) -> (vector<4xf32>) { | |
%52:2 = affine.delinearize_index %arg0 into (10, 4) : index, index | |
%53 = affine.apply affine_map<(d0)[s0] -> (d0 * 4096 + s0 * 4)>(%52#1)[%thread_id_x] | |
%54 = vector.transfer_read %1[%workgroup_id_y, %workgroup_id_x, %52#0, %53], %cst_2 {in_bounds = [true]} : memref<2x32x10x16384xf16, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf16> | |
%55 = arith.extf %54 : vector<4xf16> to vector<4xf32> | |
%56 = arith.subf %55, %25 : vector<4xf32> | |
%57 = arith.mulf %56, %56 : vector<4xf32> | |
%58 = arith.addf %57, %arg1 : vector<4xf32> | |
scf.yield %58 : vector<4xf32> | |
} | |
// Same wave butterfly + cross-wave reduction for the squared deviations.
%27 = vector.reduction <add>, %26 : vector<4xf32> into f32 | |
%shuffleResult_24, %valid_25 = gpu.shuffle xor %27, %c1_i32, %c64_i32 : f32 | |
%28 = arith.addf %27, %shuffleResult_24 : f32 | |
%shuffleResult_26, %valid_27 = gpu.shuffle xor %28, %c2_i32, %c64_i32 : f32 | |
%29 = arith.addf %28, %shuffleResult_26 : f32 | |
%shuffleResult_28, %valid_29 = gpu.shuffle xor %29, %c4_i32, %c64_i32 : f32 | |
%30 = arith.addf %29, %shuffleResult_28 : f32 | |
%shuffleResult_30, %valid_31 = gpu.shuffle xor %30, %c8_i32, %c64_i32 : f32 | |
%31 = arith.addf %30, %shuffleResult_30 : f32 | |
%shuffleResult_32, %valid_33 = gpu.shuffle xor %31, %c16_i32, %c64_i32 : f32 | |
%32 = arith.addf %31, %shuffleResult_32 : f32 | |
%shuffleResult_34, %valid_35 = gpu.shuffle xor %32, %c32_i32, %c64_i32 : f32 | |
%33 = arith.addf %32, %shuffleResult_34 : f32 | |
%alloc_36 = memref.alloc() : memref<16xf32, #gpu.address_space<workgroup>> | |
scf.if %16 { | |
memref.store %33, %alloc_36[%14] : memref<16xf32, #gpu.address_space<workgroup>> | |
} | |
gpu.barrier | |
%34 = memref.load %alloc_36[%17] : memref<16xf32, #gpu.address_space<workgroup>> | |
%shuffleResult_37, %valid_38 = gpu.shuffle xor %34, %c1_i32, %c64_i32 : f32 | |
%35 = arith.addf %34, %shuffleResult_37 : f32 | |
%shuffleResult_39, %valid_40 = gpu.shuffle xor %35, %c2_i32, %c64_i32 : f32 | |
%36 = arith.addf %35, %shuffleResult_39 : f32 | |
%shuffleResult_41, %valid_42 = gpu.shuffle xor %36, %c4_i32, %c64_i32 : f32 | |
%37 = arith.addf %36, %shuffleResult_41 : f32 | |
%shuffleResult_43, %valid_44 = gpu.shuffle xor %37, %c8_i32, %c64_i32 : f32 | |
%38 = arith.addf %37, %shuffleResult_43 : f32 | |
%shuffleResult_45, %valid_46 = gpu.shuffle idx %38, %c0_i32, %c64_i32 : f32 | |
%39 = arith.addf %shuffleResult_45, %cst_3 : f32 | |
// NOTE(review): this third shared-memory round trip re-reduces the
// first-pass wave sum %13 to rebuild the total (%45) and mean (%51)
// instead of reusing %23 from above — apparently redundant residue of
// the lowering; confirm this is intentional.
%alloc_47 = memref.alloc() : memref<16xf32, #gpu.address_space<workgroup>> | |
scf.if %16 { | |
memref.store %13, %alloc_47[%14] : memref<16xf32, #gpu.address_space<workgroup>> | |
} | |
gpu.barrier | |
%40 = memref.load %alloc_47[%17] : memref<16xf32, #gpu.address_space<workgroup>> | |
%shuffleResult_48, %valid_49 = gpu.shuffle xor %40, %c1_i32, %c64_i32 : f32 | |
%41 = arith.addf %40, %shuffleResult_48 : f32 | |
%shuffleResult_50, %valid_51 = gpu.shuffle xor %41, %c2_i32, %c64_i32 : f32 | |
%42 = arith.addf %41, %shuffleResult_50 : f32 | |
%shuffleResult_52, %valid_53 = gpu.shuffle xor %42, %c4_i32, %c64_i32 : f32 | |
%43 = arith.addf %42, %shuffleResult_52 : f32 | |
%shuffleResult_54, %valid_55 = gpu.shuffle xor %43, %c8_i32, %c64_i32 : f32 | |
%44 = arith.addf %43, %shuffleResult_54 : f32 | |
%shuffleResult_56, %valid_57 = gpu.shuffle idx %44, %c0_i32, %c64_i32 : f32 | |
%45 = arith.addf %shuffleResult_56, %cst_3 : f32 | |
// variance = %39 / 163840; inv_stddev = rsqrt(variance + 1e-5);
// mean = %45 / 163840.
%46 = vector.broadcast %39 : f32 to vector<4xf32> | |
%47 = vector.broadcast %45 : f32 to vector<4xf32> | |
%48 = arith.divf %46, %cst_0 : vector<4xf32> | |
%49 = arith.addf %48, %cst : vector<4xf32> | |
%50 = math.rsqrt %49 : vector<4xf32> | |
%51 = arith.divf %47, %cst_0 : vector<4xf32> | |
// Pass 3: normalize and write out. NOTE(review): statistics come from
// binding 0 (%1) while the normalized values are read from binding 1 (%3)
// — presumably a fused op normalizing one tensor with the other's
// statistics; verify against the dispatch source.
scf.for %arg0 = %c0 to %c40 step %c1 { | |
%52:2 = affine.delinearize_index %arg0 into (10, 4) : index, index | |
%53 = affine.apply affine_map<(d0)[s0] -> (d0 * 4096 + s0 * 4)>(%52#1)[%thread_id_x] | |
%54 = vector.transfer_read %3[%workgroup_id_y, %workgroup_id_x, %52#0, %53], %cst_2 {in_bounds = [true]} : memref<2x32x10x16384xf16, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf16> | |
%55 = arith.extf %54 : vector<4xf16> to vector<4xf32> | |
%56 = arith.subf %55, %51 : vector<4xf32> | |
%57 = arith.mulf %56, %50 : vector<4xf32> | |
vector.transfer_write %57, %5[%workgroup_id_y, %workgroup_id_x, %52#0, %53] {in_bounds = [true]} : vector<4xf32>, memref<2x32x10x16384xf32, #amdgpu.address_space<fat_raw_buffer>> | |
} | |
return | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment