Skip to content

Instantly share code, notes, and snippets.

@pashu123
Created April 22, 2025 23:45
Show Gist options
  • Save pashu123/cc67695b16c4353684aedb7cfa273265 to your computer and use it in GitHub Desktop.
Save pashu123/cc67695b16c4353684aedb7cfa273265 to your computer and use it in GitHub Desktop.
I am inside the populateWarpAndThreadIndices
The rank is 1
The threadIds size is 2
I am inside the populateWarpAndThreadIndices
The rank is 0
The threadIds size is 0
I am inside the populateWarpAndThreadIndices
The rank is 1
The threadIds size is 2
I am inside the populateWarpAndThreadIndices
The rank is 1
The threadIds size is 2
I am inside the populateWarpAndThreadIndices
The rank is 1
The threadIds size is 2
I am inside the populateWarpAndThreadIndices
The rank is 1
The threadIds size is 2
// -----// IR Dump After LLVMGPUVectorDistributePass (iree-llvmgpu-vector-distribute) //----- //
// NOTE(review): this is compiler-generated IR from a pass dump; comments below
// annotate it for readability — the IR itself is byte-identical.
//
// Per-row reduction over a 2048x1280 f16 input (widened to f32):
// phase 1 computes the row sum and divides by 1280 (the row mean), phase 2
// computes the sum of squared deviations from that mean and divides by
// 1279 = N-1 (looks like a sample-variance computation — TODO confirm against
// the original dispatch), then truncates the scalar result back to f16 and
// writes one value per row into the 2048-element output buffer.
// Workgroup size is [320, 1, 1] = 5 subgroups of 64 lanes, subgroup size 64.
func.func @main$async_dispatch_0_generic_2048x1280_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [320, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x5xf32>
%c63 = arith.constant 63 : index
%cst_0 = arith.constant dense<0.000000e+00> : vector<1xf32>
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x8xf16>
%c0 = arith.constant 0 : index
// 1279.0 = N-1 divisor used for the final (variance-like) result.
%cst_2 = arith.constant 1.279000e+03 : f32
%cst_3 = arith.constant 0.000000e+00 : f32
%cst_4 = arith.constant dense<0.000000e+00> : vector<1x1x8xf32>
%cst_5 = arith.constant 0.000000e+00 : f16
// 1280.0 = N divisor used for the row mean.
%cst_6 = arith.constant dense<1.280000e+03> : vector<1x1x8xf32>
%thread_id_z = gpu.thread_id z
%thread_id_y = gpu.thread_id y
%thread_id_x = gpu.thread_id x
// %0: linearized thread id in [0, 320) within the workgroup.
%0 = affine.linearize_index disjoint [%thread_id_z, %thread_id_y, %thread_id_x] by (1, 1, 320) : index
// Binding 0: read-only 2048x1280 f16 input; binding 1: 2048 f16 output.
// Both are re-cast to AMDGPU fat raw buffers.
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2048x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = amdgpu.fat_raw_buffer_cast %1 resetOffset : memref<2048x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<2048x1280xf16, #amdgpu.address_space<fat_raw_buffer>>
memref.assume_alignment %2, 64 : memref<2048x1280xf16, #amdgpu.address_space<fat_raw_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags(Indirect) : memref<2048xf16, #hal.descriptor_type<storage_buffer>>
%4 = amdgpu.fat_raw_buffer_cast %3 resetOffset : memref<2048xf16, #hal.descriptor_type<storage_buffer>> to memref<2048xf16, #amdgpu.address_space<fat_raw_buffer>>
memref.assume_alignment %4, 64 : memref<2048xf16, #amdgpu.address_space<fat_raw_buffer>>
// One forall iteration per input row, mapped to workgroups along x.
scf.forall (%arg0) in (2048) {
%subview = memref.subview %4[%arg0] [1] [1] : memref<2048xf16, #amdgpu.address_space<fat_raw_buffer>> to memref<1xf16, strided<[1], offset: ?>, #amdgpu.address_space<fat_raw_buffer>>
%subview_7 = memref.subview %subview[0] [1] [1] : memref<1xf16, strided<[1], offset: ?>, #amdgpu.address_space<fat_raw_buffer>> to memref<f16, strided<[], offset: ?>, #amdgpu.address_space<fat_raw_buffer>>
// %5#1: subgroup id in [0,5); %6#1: lane id mod 32.
%5:3 = affine.delinearize_index %0 into (5, 64) : index, index, index
%6:2 = affine.delinearize_index %0 into (32) : index, index
// Column offset = subgroup*256 + (lane mod 32)*8; 5*32*8 = 1280 covers the
// row, and lanes 32-63 mirror lanes 0-31 — hence the cluster(size = 32)
// subgroup reductions below.
%7 = affine.linearize_index disjoint [%5#1, %c0, %c0, %6#1, %c0] by (5, 1, 1, 32, 8) : index
// Each thread loads 8 contiguous f16 elements of row %arg0, widens to f32.
%8 = vector.transfer_read %2[%arg0, %7], %cst_5 {in_bounds = [true]} : memref<2048x1280xf16, #amdgpu.address_space<fat_raw_buffer>>, vector<8xf16>
%9 = vector.insert_strided_slice %8, %cst_1 {offsets = [0, 0, 0], strides = [1]} : vector<8xf16> into vector<1x1x8xf16>
%10 = arith.extf %9 : vector<1x1x8xf16> to vector<1x1x8xf32>
%11 = arith.addf %10, %cst_4 : vector<1x1x8xf32>
// Phase 1: in-thread sum of the 8 elements, then add across a 32-lane cluster.
%12 = vector.multi_reduction <add>, %11, %cst_3 [0, 1, 2] : vector<1x1x8xf32> to f32
%13 = gpu.subgroup_reduce add %12 cluster(size = 32) : (f32) -> f32
%14 = vector.insert %13, %cst_0 [0] : f32 into vector<1xf32>
// Workgroup (shared-memory) buffer: one partial sum per subgroup (5 total).
%alloc = memref.alloc() : memref<5xf32, #gpu.address_space<workgroup>>
gpu.barrier
%15:3 = affine.delinearize_index %0 into (5, 64) : index, index, index
%16:2 = affine.delinearize_index %0 into (1) : index, index
// %17 == subgroup id: each subgroup writes its partial to slot [subgroup].
%17 = affine.linearize_index disjoint [%15#1, %c0, %c0, %16#1, %c0] by (5, 1, 1, 1, 1) : index
vector.transfer_write %14, %alloc[%17] {in_bounds = [true]} : vector<1xf32>, memref<5xf32, #gpu.address_space<workgroup>>
gpu.barrier
%18:3 = affine.delinearize_index %0 into (1, 64) : index, index, index
%19:2 = affine.delinearize_index %0 into (1) : index, index
// %20 == 0: every thread re-reads all 5 partials and reduces to the row sum.
%20 = affine.linearize_index disjoint [%18#1, %c0, %c0, %19#1, %c0] by (1, 1, 1, 1, 5) : index
%21 = vector.transfer_read %alloc[%20], %cst_3 {in_bounds = [true]} : memref<5xf32, #gpu.address_space<workgroup>>, vector<5xf32>
%22 = vector.insert_strided_slice %21, %cst {offsets = [0, 0, 0], strides = [1]} : vector<5xf32> into vector<1x1x5xf32>
%23 = vector.multi_reduction <add>, %22, %cst_3 [0, 1, 2] : vector<1x1x5xf32> to f32
%24 = vector.broadcast %23 : f32 to vector<1xf32>
%25 = arith.addf %24, %cst_0 : vector<1xf32>
%26 = vector.extract %25[0] : f32 from vector<1xf32>
%27 = vector.broadcast %26 : f32 to vector<8xf32>
%28 = vector.insert %27, %cst_4 [0, 0] : vector<8xf32> into vector<1x1x8xf32>
// Row mean = row sum / 1280, broadcast across each thread's 8 elements.
%29 = arith.divf %28, %cst_6 : vector<1x1x8xf32>
// Phase 2: squared deviations from the mean, summed the same two-level way.
%30 = arith.subf %10, %29 : vector<1x1x8xf32>
%31 = arith.mulf %30, %30 : vector<1x1x8xf32>
%32 = arith.addf %31, %cst_4 : vector<1x1x8xf32>
%33 = vector.multi_reduction <add>, %32, %cst_3 [0, 1, 2] : vector<1x1x8xf32> to f32
%34 = gpu.subgroup_reduce add %33 cluster(size = 32) : (f32) -> f32
%35 = vector.insert %34, %cst_0 [0] : f32 into vector<1xf32>
// Second shared buffer for the squared-deviation partials (same layout).
%alloc_8 = memref.alloc() : memref<5xf32, #gpu.address_space<workgroup>>
gpu.barrier
%36:3 = affine.delinearize_index %0 into (5, 64) : index, index, index
%37:2 = affine.delinearize_index %0 into (1) : index, index
%38 = affine.linearize_index disjoint [%36#1, %c0, %c0, %37#1, %c0] by (5, 1, 1, 1, 1) : index
vector.transfer_write %35, %alloc_8[%38] {in_bounds = [true]} : vector<1xf32>, memref<5xf32, #gpu.address_space<workgroup>>
gpu.barrier
%39:3 = affine.delinearize_index %0 into (1, 64) : index, index, index
%40:2 = affine.delinearize_index %0 into (1) : index, index
%41 = affine.linearize_index disjoint [%39#1, %c0, %c0, %40#1, %c0] by (1, 1, 1, 1, 5) : index
%42 = vector.transfer_read %alloc_8[%41], %cst_3 {in_bounds = [true]} : memref<5xf32, #gpu.address_space<workgroup>>, vector<5xf32>
%43 = vector.insert_strided_slice %42, %cst {offsets = [0, 0, 0], strides = [1]} : vector<5xf32> into vector<1x1x5xf32>
%44 = vector.multi_reduction <add>, %43, %cst_3 [0, 1, 2] : vector<1x1x5xf32> to f32
%45 = vector.broadcast %44 : f32 to vector<1xf32>
%46 = arith.addf %45, %cst_0 : vector<1xf32>
%47 = vector.extract %46[0] : f32 from vector<1xf32>
// Final scalar = sum of squared deviations / 1279, narrowed to f16.
%48 = arith.divf %47, %cst_2 : f32
%49 = arith.truncf %48 : f32 to f16
%50 = vector.broadcast %49 : f16 to vector<f16>
// Only the thread with linear id 63 writes the per-row result.
%51 = arith.cmpi eq, %0, %c63 : index
scf.if %51 {
vector.transfer_write %50, %subview_7[] : vector<f16>, memref<f16, strided<[], offset: ?>, #amdgpu.address_space<fat_raw_buffer>>
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
return
}
iree-compile: for the --iree-codegen-gpu-native-math-precision option: This option is deprecated, does not do anything anymore, and will be removed soon. It was mainly used on the ROCm target, but the behavior that it once enabled is now default on ROCm. More generally, MathTransformPass should do the right things for each target.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment