@pashu123
Created April 24, 2025 19:35
// -----// IR Dump After DropCompilerHintsPass (iree-util-drop-compiler-hints) //----- //
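// Two dumps of the same dispatch follow: the original lowering and a second
// variant. The reduction is a matmul-like contraction of shape 2x1280x2816
// (f16 x f16 -> f32): each workgroup computes one output element, using 704
// threads (11 waves of 64 lanes) so that each thread covers 4 of the 2816
// K-elements (704 * 4 = 2816).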
module {
func.func @faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32() {
%cst = arith.constant 0.000000e+00 : f32
%c0_i32 = arith.constant 0 : i32
%c256_i32 = arith.constant 256 : i32
%c32_i32 = arith.constant 32 : i32
%c16_i32 = arith.constant 16 : i32
%c8_i32 = arith.constant 8 : i32
%c4_i32 = arith.constant 4 : i32
%c2_i32 = arith.constant 2 : i32
%c64_i32 = arith.constant 64 : i32
%c1_i32 = arith.constant 1 : i32
%cst_0 = arith.constant dense<0.000000e+00> : vector<4xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<1xf32>
%cst_2 = arith.constant dense<0.000000e+00> : vector<1x1x4xf16>
%c0 = arith.constant 0 : index
%cst_3 = arith.constant dense<0.000000e+00> : vector<1x1x4xf32>
%thread_id_x = gpu.thread_id x upper_bound 704
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2816x2xf16, #gpu.address_space<global>>
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<2816x2xf16, #gpu.address_space<global>> to memref<2816x2xf16, #amdgpu.address_space<fat_raw_buffer>>
memref.assume_alignment %1, 64 : memref<2816x2xf16, #amdgpu.address_space<fat_raw_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1280x2816xf16, #gpu.address_space<global>>
%3 = amdgpu.fat_raw_buffer_cast %2 resetOffset : memref<1280x2816xf16, #gpu.address_space<global>> to memref<1280x2816xf16, #amdgpu.address_space<fat_raw_buffer>>
memref.assume_alignment %3, 64 : memref<1280x2816xf16, #amdgpu.address_space<fat_raw_buffer>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x1280xf32, #gpu.address_space<global>>
%5 = amdgpu.fat_raw_buffer_cast %4 resetOffset : memref<2x1280xf32, #gpu.address_space<global>> to memref<2x1280xf32, #amdgpu.address_space<fat_raw_buffer>>
memref.assume_alignment %5, 64 : memref<2x1280xf32, #amdgpu.address_space<fat_raw_buffer>>
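// Bindings: binding(0) is the 2816x2 f16 operand, binding(1) the 1280x2816
// f16 operand, and binding(2) the 2x1280 f32 result. Each subspan is cast to
// an AMDGPU fat raw buffer so accesses lower to buffer instructions.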
%workgroup_id_x = hal.interface.workgroup.id[0] upper_bound 1280 : index
%workgroup_id_y = hal.interface.workgroup.id[1] upper_bound 2 : index
%6 = arith.index_castui %thread_id_x : index to i32
%7 = arith.divui %6, %c64_i32 : i32
%8 = arith.index_castui %7 : i32 to index
%9 = arith.remui %6, %c64_i32 : i32
%10 = arith.muli %7, %c256_i32 : i32
%11 = arith.muli %9, %c4_i32 : i32
%12 = arith.addi %10, %11 : i32
%13 = arith.index_castui %12 : i32 to index
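// Per-thread K offset: wave id %7 = tid / 64, lane %9 = tid % 64; each wave
// covers 256 K-elements (64 lanes * 4), so this thread starts at
// K = wave * 256 + lane * 4 (%12/%13). The loop below gathers 4 f16 values
// from the 2816x2 operand one at a time, since column %workgroup_id_y is
// strided (stride 2 elements) along K and cannot be loaded as a vector.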
cf.br ^bb1(%c0_i32, %cst_0 : i32, vector<4xf16>)
^bb1(%14: i32, %15: vector<4xf16>): // 2 preds: ^bb0, ^bb2
%16 = arith.cmpi slt, %14, %c4_i32 : i32
cf.cond_br %16, ^bb2, ^bb3
^bb2: // pred: ^bb1
%17 = arith.index_castui %14 : i32 to index
%18 = arith.addi %12, %14 : i32
%19 = arith.index_castui %18 : i32 to index
%20 = memref.load %1[%19, %workgroup_id_y] : memref<2816x2xf16, #amdgpu.address_space<fat_raw_buffer>>
%21 = vector.insertelement %20, %15[%17 : index] : vector<4xf16>
%22 = arith.addi %14, %c1_i32 : i32
cf.br ^bb1(%22, %21 : i32, vector<4xf16>)
^bb3: // pred: ^bb1
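// Widen both 4-element slices to f32, multiply elementwise, then sum the
// four products with a chain of scalar extracts and adds (%30..%37).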
%23 = vector.insert_strided_slice %15, %cst_2 {offsets = [0, 0, 0], strides = [1]} : vector<4xf16> into vector<1x1x4xf16>
%24 = vector.load %3[%workgroup_id_x, %13] : memref<1280x2816xf16, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf16>
%25 = vector.insert_strided_slice %24, %cst_2 {offsets = [0, 0, 0], strides = [1]} : vector<4xf16> into vector<1x1x4xf16>
%26 = arith.extf %23 : vector<1x1x4xf16> to vector<1x1x4xf32>
%27 = arith.extf %25 : vector<1x1x4xf16> to vector<1x1x4xf32>
%28 = arith.mulf %26, %27 : vector<1x1x4xf32>
%29 = arith.addf %28, %cst_3 : vector<1x1x4xf32>
%30 = vector.extract %29[0, 0, 0] : f32 from vector<1x1x4xf32>
%31 = vector.extract %29[0, 0, 1] : f32 from vector<1x1x4xf32>
%32 = vector.extract %29[0, 0, 2] : f32 from vector<1x1x4xf32>
%33 = vector.extract %29[0, 0, 3] : f32 from vector<1x1x4xf32>
%34 = arith.addf %30, %cst : f32
%35 = arith.addf %31, %34 : f32
%36 = arith.addf %32, %35 : f32
%37 = arith.addf %33, %36 : f32
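// Subgroup (wave) reduction: a 6-step xor-shuffle butterfly over the 64-lane
// wave (offsets 1, 2, 4, 8, 16, 32); afterwards every lane holds the full
// wave sum in %43.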
%shuffleResult, %valid = gpu.shuffle xor %37, %c1_i32, %c64_i32 : f32
%38 = arith.addf %37, %shuffleResult : f32
%shuffleResult_4, %valid_5 = gpu.shuffle xor %38, %c2_i32, %c64_i32 : f32
%39 = arith.addf %38, %shuffleResult_4 : f32
%shuffleResult_6, %valid_7 = gpu.shuffle xor %39, %c4_i32, %c64_i32 : f32
%40 = arith.addf %39, %shuffleResult_6 : f32
%shuffleResult_8, %valid_9 = gpu.shuffle xor %40, %c8_i32, %c64_i32 : f32
%41 = arith.addf %40, %shuffleResult_8 : f32
%shuffleResult_10, %valid_11 = gpu.shuffle xor %41, %c16_i32, %c64_i32 : f32
%42 = arith.addf %41, %shuffleResult_10 : f32
%shuffleResult_12, %valid_13 = gpu.shuffle xor %42, %c32_i32, %c64_i32 : f32
%43 = arith.addf %42, %shuffleResult_12 : f32
%44 = vector.insert %43, %cst_1 [0] : f32 into vector<1xf32>
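// Cross-wave stage: lane 0 of each of the 11 waves stores its wave sum into
// workgroup memory. The buffer is 13 elements even though only indices 0..10
// are written; the extra slots look like padding introduced by the lowering.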
%alloc = memref.alloc() : memref<13xf32, #gpu.address_space<workgroup>>
gpu.barrier
%45 = arith.cmpi eq, %9, %c0_i32 : i32
cf.cond_br %45, ^bb4, ^bb5
^bb4: // pred: ^bb3
vector.store %44, %alloc[%8] : memref<13xf32, #gpu.address_space<workgroup>>, vector<1xf32>
cf.br ^bb5
^bb5: // 2 preds: ^bb3, ^bb4
gpu.barrier
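// Every thread reloads all 11 wave partials as a single vector<11xf32> and
// reduces them with another serial chain of scalar adds (%58..%68); only
// thread 0 stores the final f32 to the 2x1280 output.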
%46 = vector.load %alloc[%c0] : memref<13xf32, #gpu.address_space<workgroup>>, vector<11xf32>
%47 = vector.extract %46[0] : f32 from vector<11xf32>
%48 = vector.extract %46[1] : f32 from vector<11xf32>
%49 = vector.extract %46[2] : f32 from vector<11xf32>
%50 = vector.extract %46[3] : f32 from vector<11xf32>
%51 = vector.extract %46[4] : f32 from vector<11xf32>
%52 = vector.extract %46[5] : f32 from vector<11xf32>
%53 = vector.extract %46[6] : f32 from vector<11xf32>
%54 = vector.extract %46[7] : f32 from vector<11xf32>
%55 = vector.extract %46[8] : f32 from vector<11xf32>
%56 = vector.extract %46[9] : f32 from vector<11xf32>
%57 = vector.extract %46[10] : f32 from vector<11xf32>
%58 = arith.addf %47, %cst : f32
%59 = arith.addf %48, %58 : f32
%60 = arith.addf %49, %59 : f32
%61 = arith.addf %50, %60 : f32
%62 = arith.addf %51, %61 : f32
%63 = arith.addf %52, %62 : f32
%64 = arith.addf %53, %63 : f32
%65 = arith.addf %54, %64 : f32
%66 = arith.addf %55, %65 : f32
%67 = arith.addf %56, %66 : f32
%68 = arith.addf %57, %67 : f32
%69 = arith.addf %68, %cst : f32
%70 = vector.splat %69 : vector<f32>
%71 = arith.cmpi eq, %6, %c0_i32 : i32
cf.cond_br %71, ^bb6, ^bb7
^bb6: // pred: ^bb5
vector.store %70, %5[%workgroup_id_y, %workgroup_id_x] : memref<2x1280xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<f32>
cf.br ^bb7
^bb7: // 2 preds: ^bb5, ^bb6
return
}
}
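// Second variant of the same dispatch. The main differences from the dump
// above: the contiguous operand is loaded before the gather loop, the
// per-thread 4-element product is folded with vector.reduction, and the 11
// wave partials are combined with a second in-register shuffle tree instead
// of a serial sum over a shared-memory vector load.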
module {
func.func @faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32() {
%c11_i32 = arith.constant 11 : i32
%c10_i32 = arith.constant 10 : i32
%cst = arith.constant dense<0.000000e+00> : vector<4xf16>
%c0_i32 = arith.constant 0 : i32
%c32_i32 = arith.constant 32 : i32
%c16_i32 = arith.constant 16 : i32
%c8_i32 = arith.constant 8 : i32
%c4_i32 = arith.constant 4 : i32
%c2_i32 = arith.constant 2 : i32
%c64_i32 = arith.constant 64 : i32
%c1_i32 = arith.constant 1 : i32
%cst_0 = arith.constant dense<0.000000e+00> : vector<4xf32>
%c0 = arith.constant 0 : index
%cst_1 = arith.constant 0.000000e+00 : f32
%thread_id_x = gpu.thread_id x upper_bound 704
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2816x2xf16, #gpu.address_space<global>>
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<2816x2xf16, #gpu.address_space<global>> to memref<2816x2xf16, #amdgpu.address_space<fat_raw_buffer>>
memref.assume_alignment %1, 64 : memref<2816x2xf16, #amdgpu.address_space<fat_raw_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1280x2816xf16, #gpu.address_space<global>>
%3 = amdgpu.fat_raw_buffer_cast %2 resetOffset : memref<1280x2816xf16, #gpu.address_space<global>> to memref<1280x2816xf16, #amdgpu.address_space<fat_raw_buffer>>
memref.assume_alignment %3, 64 : memref<1280x2816xf16, #amdgpu.address_space<fat_raw_buffer>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x1280xf32, #gpu.address_space<global>>
%5 = amdgpu.fat_raw_buffer_cast %4 resetOffset : memref<2x1280xf32, #gpu.address_space<global>> to memref<2x1280xf32, #amdgpu.address_space<fat_raw_buffer>>
memref.assume_alignment %5, 64 : memref<2x1280xf32, #amdgpu.address_space<fat_raw_buffer>>
%workgroup_id_y = hal.interface.workgroup.id[1] upper_bound 2 : index
%workgroup_id_x = hal.interface.workgroup.id[0] upper_bound 1280 : index
%6 = arith.index_castui %thread_id_x : index to i32
%7 = arith.muli %6, %c4_i32 overflow<nsw> : i32
%8 = arith.index_castui %7 : i32 to index
%9 = vector.load %3[%workgroup_id_x, %8] : memref<1280x2816xf16, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf16>
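// The row of the 1280x2816 operand is contiguous along K, so this thread's
// 4-element slice is loaded up front as a single vector<4xf16> at K = tid * 4
// (algebraically the same offset the first dump computes as
// wave * 256 + lane * 4); the strided 2816x2 operand is still gathered
// scalar-by-scalar in the loop below.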
cf.br ^bb1(%c0_i32, %cst : i32, vector<4xf16>)
^bb1(%10: i32, %11: vector<4xf16>): // 2 preds: ^bb0, ^bb2
%12 = arith.cmpi slt, %10, %c4_i32 : i32
cf.cond_br %12, ^bb2, ^bb3
^bb2: // pred: ^bb1
%13 = arith.index_castui %10 : i32 to index
%14 = arith.addi %7, %10 : i32
%15 = arith.index_castui %14 : i32 to index
%16 = memref.load %1[%15, %workgroup_id_y] : memref<2816x2xf16, #amdgpu.address_space<fat_raw_buffer>>
%17 = vector.insertelement %16, %11[%13 : index] : vector<4xf16>
%18 = arith.addi %10, %c1_i32 : i32
cf.br ^bb1(%18, %17 : i32, vector<4xf16>)
^bb3: // pred: ^bb1
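// Same widen-and-multiply step as above, but the four products are summed
// with a single vector.reduction instead of scalar extracts, and the wave
// total is then combined with the same 6-step xor-shuffle butterfly.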
%19 = arith.extf %9 : vector<4xf16> to vector<4xf32>
%20 = arith.extf %11 : vector<4xf16> to vector<4xf32>
%21 = arith.mulf %20, %19 : vector<4xf32>
%22 = arith.addf %21, %cst_0 : vector<4xf32>
%23 = vector.reduction <add>, %22 : vector<4xf32> into f32
%shuffleResult, %valid = gpu.shuffle xor %23, %c1_i32, %c64_i32 : f32
%24 = arith.addf %23, %shuffleResult : f32
%shuffleResult_2, %valid_3 = gpu.shuffle xor %24, %c2_i32, %c64_i32 : f32
%25 = arith.addf %24, %shuffleResult_2 : f32
%shuffleResult_4, %valid_5 = gpu.shuffle xor %25, %c4_i32, %c64_i32 : f32
%26 = arith.addf %25, %shuffleResult_4 : f32
%shuffleResult_6, %valid_7 = gpu.shuffle xor %26, %c8_i32, %c64_i32 : f32
%27 = arith.addf %26, %shuffleResult_6 : f32
%shuffleResult_8, %valid_9 = gpu.shuffle xor %27, %c16_i32, %c64_i32 : f32
%28 = arith.addf %27, %shuffleResult_8 : f32
%shuffleResult_10, %valid_11 = gpu.shuffle xor %28, %c32_i32, %c64_i32 : f32
%29 = arith.addf %28, %shuffleResult_10 : f32
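// Cross-wave stage: the shared buffer is now exactly 11 elements (one slot
// per wave); lane 0 of each wave stores its wave sum before the barrier.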
%alloc = memref.alloc() : memref<11xf32, #gpu.address_space<workgroup>>
%30 = arith.divui %6, %c64_i32 : i32
%31 = arith.index_castui %30 : i32 to index
%32 = arith.remui %6, %c64_i32 : i32
%33 = arith.cmpi eq, %32, %c0_i32 : i32
cf.cond_br %33, ^bb4, ^bb5
^bb4: // pred: ^bb3
memref.store %29, %alloc[%31] : memref<11xf32, #gpu.address_space<workgroup>>
cf.br ^bb5
^bb5: // 2 preds: ^bb3, ^bb4
gpu.barrier
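// Second reduction stage, done in registers: each lane reads partial
// min(lane, 10) and lanes >= 11 substitute 0.0, so a 4-step xor-shuffle
// tree (offsets 1, 2, 4, 8, i.e. a 16-lane butterfly) sums all 11 partials;
// gpu.shuffle idx then broadcasts lane 0's total to the whole wave, and
// thread 0 writes the final f32.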
%34 = arith.minui %32, %c10_i32 : i32
%35 = arith.index_castui %34 : i32 to index
%36 = memref.load %alloc[%35] : memref<11xf32, #gpu.address_space<workgroup>>
%37 = arith.cmpi sge, %32, %c11_i32 : i32
%38 = arith.select %37, %cst_1, %36 : f32
%shuffleResult_12, %valid_13 = gpu.shuffle xor %38, %c1_i32, %c64_i32 : f32
%39 = arith.addf %38, %shuffleResult_12 : f32
%shuffleResult_14, %valid_15 = gpu.shuffle xor %39, %c2_i32, %c64_i32 : f32
%40 = arith.addf %39, %shuffleResult_14 : f32
%shuffleResult_16, %valid_17 = gpu.shuffle xor %40, %c4_i32, %c64_i32 : f32
%41 = arith.addf %40, %shuffleResult_16 : f32
%shuffleResult_18, %valid_19 = gpu.shuffle xor %41, %c8_i32, %c64_i32 : f32
%42 = arith.addf %41, %shuffleResult_18 : f32
%shuffleResult_20, %valid_21 = gpu.shuffle idx %42, %c0_i32, %c64_i32 : f32
%43 = arith.addf %shuffleResult_20, %cst_1 : f32
%44 = vector.splat %43 : vector<1xf32>
%45 = arith.cmpi eq, %6, %c0_i32 : i32
cf.cond_br %45, ^bb6, ^bb7
^bb6: // pred: ^bb5
vector.store %44, %5[%workgroup_id_y, %workgroup_id_x] : memref<2x1280xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<1xf32>
cf.br ^bb7
^bb7: // 2 preds: ^bb5, ^bb6
return
}
}