Created April 22, 2025 23:37
I am inside the populateWarpAndThreadIndices
The rank is 1
The threadIds size is 2
I am inside the populateWarpAndThreadIndices
The rank is 1
The threadIds size is 2
I am inside the populateWarpAndThreadIndices
The rank is 0
The threadIds size is 0
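The three traces above look like ad-hoc instrumentation inside IREE's populateWarpAndThreadIndices helper, recording the distributed value's rank and the number of thread indices produced. For reference, tracing of this kind is normally routed through LLVM's debug facilities rather than raw prints; the sketch below is a stand-in under that assumption (the function name and parameters here are illustrative, not the real helper's signature in IREE's LLVMGPU utilities).

#include <cstdint>

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "iree-llvmgpu-vector-distribute"

// Hypothetical stand-in: only the trace output seen above is modeled here.
// Enable with `-debug-only=iree-llvmgpu-vector-distribute` in a debug build.
static void traceWarpAndThreadIndices(int64_t rank,
                                      llvm::ArrayRef<int64_t> threadIds) {
  LLVM_DEBUG(llvm::dbgs() << "I am inside the populateWarpAndThreadIndices\n");
  LLVM_DEBUG(llvm::dbgs() << "The rank is " << rank << "\n");
  LLVM_DEBUG(llvm::dbgs() << "The threadIds size is " << threadIds.size() << "\n");
}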
// -----// IR Dump After LLVMGPUVectorDistributePass (iree-llvmgpu-vector-distribute) //----- //
func.func @matvec_dispatch_0_matvec_like_32000x4096_f16xf16xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} {
  %c63 = arith.constant 63 : index
  %cst = arith.constant dense<0.000000e+00> : vector<1xf32>
  %cst_0 = arith.constant dense<0.000000e+00> : vector<1x1x8xf16>
  %c0 = arith.constant 0 : index
  %cst_1 = arith.constant 0.000000e+00 : f32
  %c4096 = arith.constant 4096 : index
  %c512 = arith.constant 512 : index
  %cst_2 = arith.constant 0.000000e+00 : f16
  %cst_3 = arith.constant dense<0.000000e+00> : vector<1x1x8xf32>
  %thread_id_z = gpu.thread_id z
  %thread_id_y = gpu.thread_id y
  %thread_id_x = gpu.thread_id x
  %0 = affine.linearize_index disjoint [%thread_id_z, %thread_id_y, %thread_id_x] by (1, 1, 64) : index
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>
  %2 = amdgpu.fat_raw_buffer_cast %1 resetOffset : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<32000x4096xf16, #amdgpu.address_space<fat_raw_buffer>>
  memref.assume_alignment %2, 64 : memref<32000x4096xf16, #amdgpu.address_space<fat_raw_buffer>>
  %3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<4096xf16, #hal.descriptor_type<storage_buffer>>
  %4 = amdgpu.fat_raw_buffer_cast %3 resetOffset : memref<4096xf16, #hal.descriptor_type<storage_buffer>> to memref<4096xf16, #amdgpu.address_space<fat_raw_buffer>>
  memref.assume_alignment %4, 64 : memref<4096xf16, #amdgpu.address_space<fat_raw_buffer>>
  %5 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<32000xf32, #hal.descriptor_type<storage_buffer>>
  %6 = amdgpu.fat_raw_buffer_cast %5 resetOffset : memref<32000xf32, #hal.descriptor_type<storage_buffer>> to memref<32000xf32, #amdgpu.address_space<fat_raw_buffer>>
  memref.assume_alignment %6, 64 : memref<32000xf32, #amdgpu.address_space<fat_raw_buffer>>
  scf.forall (%arg0) in (32000) {
    %subview = memref.subview %6[%arg0] [1] [1] : memref<32000xf32, #amdgpu.address_space<fat_raw_buffer>> to memref<1xf32, strided<[1], offset: ?>, #amdgpu.address_space<fat_raw_buffer>>
    %subview_4 = memref.subview %subview[0] [1] [1] : memref<1xf32, strided<[1], offset: ?>, #amdgpu.address_space<fat_raw_buffer>> to memref<f32, strided<[], offset: ?>, #amdgpu.address_space<fat_raw_buffer>>
    %7 = scf.for %arg1 = %c0 to %c4096 step %c512 iter_args(%arg2 = %cst_3) -> (vector<1x1x8xf32>) {
      %15:3 = affine.delinearize_index %0 into (1, 64) : index, index, index
      %16:2 = affine.delinearize_index %0 into (64) : index, index
      %17 = affine.linearize_index [%15#1, %c0, %c0, %16#1, %arg1] by (1, 1, 1, 64, 8) : index
      %18 = vector.transfer_read %2[%arg0, %17], %cst_2 {in_bounds = [true]} : memref<32000x4096xf16, #amdgpu.address_space<fat_raw_buffer>>, vector<8xf16>
      %19 = vector.insert_strided_slice %18, %cst_0 {offsets = [0, 0, 0], strides = [1]} : vector<8xf16> into vector<1x1x8xf16>
      %20:3 = affine.delinearize_index %0 into (1, 64) : index, index, index
      %21:2 = affine.delinearize_index %0 into (64) : index, index
      %22 = affine.linearize_index [%20#1, %c0, %c0, %21#1, %arg1] by (1, 1, 1, 64, 8) : index
      %23 = vector.transfer_read %4[%22], %cst_2 {in_bounds = [true]} : memref<4096xf16, #amdgpu.address_space<fat_raw_buffer>>, vector<8xf16>
      %24 = vector.insert_strided_slice %23, %cst_0 {offsets = [0, 0, 0], strides = [1]} : vector<8xf16> into vector<1x1x8xf16>
      %25 = arith.extf %19 : vector<1x1x8xf16> to vector<1x1x8xf32>
      %26 = arith.extf %24 : vector<1x1x8xf16> to vector<1x1x8xf32>
      %27 = arith.mulf %25, %26 : vector<1x1x8xf32>
      %28 = arith.addf %arg2, %27 : vector<1x1x8xf32>
      scf.yield %28 : vector<1x1x8xf32>
    }
    %8 = vector.multi_reduction <add>, %7, %cst_1 [0, 1, 2] : vector<1x1x8xf32> to f32
    %9 = gpu.subgroup_reduce add %8 cluster(size = 64) : (f32) -> f32
    %10 = vector.insert %9, %cst [0] : f32 into vector<1xf32>
    %11 = arith.addf %10, %cst : vector<1xf32>
    %12 = vector.extract %11[0] : f32 from vector<1xf32>
    %13 = vector.broadcast %12 : f32 to vector<f32>
    %14 = arith.cmpi eq, %0, %c63 : index
    scf.if %14 {
      vector.transfer_write %13, %subview_4[] : vector<f32>, memref<f32, strided<[], offset: ?>, #amdgpu.address_space<fat_raw_buffer>>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
  return
}
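To summarize what the distributed IR computes: each workgroup of 64 threads handles one output row of the 32000x4096 matvec. Per k-step of 512 (64 threads x 8 elements, the scf.for step), every thread reads 8 contiguous f16 values from the matrix row and from the input vector at the offset produced by affine.linearize_index, widens them to f32, and accumulates; vector.multi_reduction then collapses each thread's 8-wide partial, gpu.subgroup_reduce sums the 64 lanes, and the lane with linearized thread id 63 writes the result. The following is a minimal scalar C++ model of that decomposition, under the stated assumptions; the function name is hypothetical and the f16 operands are modeled as float for simplicity, so this is a sketch, not IREE code.

#include <cstdint>
#include <vector>

// Scalar model of one workgroup's work for a single output row.
// Mirrors: 64 threads, 8 elements per thread per k-step, k advancing by 512,
// per-lane partials combined by the subgroup reduction, lane 63 writing out.
float matvecRowModel(const std::vector<float> &A,  // row-major, 32000 x cols
                     const std::vector<float> &x,  // length cols
                     int64_t row, int64_t cols = 4096) {
  constexpr int64_t kThreads = 64;
  constexpr int64_t kVecLen = 8;
  constexpr int64_t kStep = kThreads * kVecLen;  // 512, the scf.for step
  std::vector<float> partial(kThreads, 0.0f);
  for (int64_t k = 0; k < cols; k += kStep) {
    for (int64_t tid = 0; tid < kThreads; ++tid) {
      for (int64_t v = 0; v < kVecLen; ++v) {
        // Column offset follows the affine.linearize_index pattern above:
        // col = k + tid * 8 + v.
        int64_t col = k + tid * kVecLen + v;
        partial[tid] += A[row * cols + col] * x[col];
      }
    }
  }
  // gpu.subgroup_reduce add over the 64 lanes.
  float acc = 0.0f;
  for (float p : partial)
    acc += p;
  return acc;  // the lane with linearized id 63 stores this into out[row]
}

Floating-point accumulation order differs from the GPU schedule, so results match only up to rounding; the model is meant to make the index arithmetic and reduction structure easy to check against the dump.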
iree-compile: for the --iree-codegen-gpu-native-math-precision option: This option is deprecated, does not do anything anymore, and will be removed soon. It was mainly used on the ROCm target, but the behavior that it once enabled is now default on ROCm. More generally, MathTransformPass should do the right things for each target.