@pashu123
Created April 24, 2025 19:35
// -----// IR Dump After DropCompilerHintsPass (iree-util-drop-compiler-hints) //----- //
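// Two dumps of the same dispatch follow: the original lowering and a second
// variant. The reduction is a matmul-like contraction of shape 2x1280x2816
// (f16 x f16 -> f32): each workgroup computes one output element, using 704
// threads (11 waves of 64 lanes) so that each thread covers 4 of the 2816
// K-elements (704 * 4 = 2816).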
module {
func.func @faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32() {
%cst = arith.constant 0.000000e+00 : f32
%c0_i32 = arith.constant 0 : i32
%c256_i32 = arith.constant 256 : i32
%c32_i32 = arith.constant 32 : i32
%c16_i32 = arith.constant 16 : i32
%c8_i32 = arith.constant 8 : i32
%c4_i32 = arith.constant 4 : i32
%c2_i32 = arith.constant 2 : i32
%c64_i32 = arith.constant 64 : i32
%c1_i32 = arith.constant 1 : i32
%cst_0 = arith.constant dense<0.000000e+00> : vector<4xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<1xf32>
%cst_2 = arith.constant dense<0.000000e+00> : vector<1x1x4xf16>
%c0 = arith.constant 0 : index
%cst_3 = arith.constant dense<0.000000e+00> : vector<1x1x4xf32>
%thread_id_x = gpu.thread_id x upper_bound 704
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2816x2xf16, #gpu.address_space<global>>
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<2816x2xf16, #gpu.address_space<global>> to memref<2816x2xf16, #amdgpu.address_space<fat_raw_buffer>>
memref.assume_alignment %1, 64 : memref<2816x2xf16, #amdgpu.address_space<fat_raw_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1280x2816xf16, #gpu.address_space<global>>
%3 = amdgpu.fat_raw_buffer_cast %2 resetOffset : memref<1280x2816xf16, #gpu.address_space<global>> to memref<1280x2816xf16, #amdgpu.address_space<fat_raw_buffer>>
memref.assume_alignment %3, 64 : memref<1280x2816xf16, #amdgpu.address_space<fat_raw_buffer>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x1280xf32, #gpu.address_space<global>>
%5 = amdgpu.fat_raw_buffer_cast %4 resetOffset : memref<2x1280xf32, #gpu.address_space<global>> to memref<2x1280xf32, #amdgpu.address_space<fat_raw_buffer>>
memref.assume_alignment %5, 64 : memref<2x1280xf32, #amdgpu.address_space<fat_raw_buffer>>
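// Bindings: binding(0) is the 2816x2 f16 operand, binding(1) the 1280x2816
// f16 operand, and binding(2) the 2x1280 f32 result. Each subspan is cast to
// an AMDGPU fat raw buffer so accesses lower to buffer instructions.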
%workgroup_id_x = hal.interface.workgroup.id[0] upper_bound 1280 : index
%workgroup_id_y = hal.interface.workgroup.id[1] upper_bound 2 : index
%6 = arith.index_castui %thread_id_x : index to i32
%7 = arith.divui %6, %c64_i32 : i32
%8 = arith.index_castui %7 : i32 to index
%9 = arith.remui %6, %c64_i32 : i32
%10 = arith.muli %7, %c256_i32 : i32
%11 = arith.muli %9, %c4_i32 : i32
%12 = arith.addi %10, %11 : i32
%13 = arith.index_castui %12 : i32 to index
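// Per-thread K offset: wave id %7 = tid / 64, lane %9 = tid % 64; each wave
// covers 256 K-elements (64 lanes * 4), so this thread starts at
// K = wave * 256 + lane * 4 (%12/%13). The loop below gathers 4 f16 values
// from the 2816x2 operand one at a time, since column %workgroup_id_y is
// strided (stride 2 elements) along K and cannot be loaded as a vector.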
cf.br ^bb1(%c0_i32, %cst_0 : i32, vector<4xf16>)
^bb1(%14: i32, %15: vector<4xf16>): // 2 preds: ^bb0, ^bb2
%16 = arith.cmpi slt, %14, %c4_i32 : i32
cf.cond_br %16, ^bb2, ^bb3
^bb2: // pred: ^bb1
%17 = arith.index_castui %14 : i32 to index
%18 = arith.addi %12, %14 : i32
%19 = arith.index_castui %18 : i32 to index
%20 = memref.load %1[%19, %workgroup_id_y] : memref<2816x2xf16, #amdgpu.address_space<fat_raw_buffer>>
%21 = vector.insertelement %20, %15[%17 : index] : vector<4xf16>
%22 = arith.addi %14, %c1_i32 : i32
cf.br ^bb1(%22, %21 : i32, vector<4xf16>)
^bb3: // pred: ^bb1
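// Widen both 4-element slices to f32, multiply elementwise, then sum the
// four products with a chain of scalar extracts and adds (%30..%37).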
%23 = vector.insert_strided_slice %15, %cst_2 {offsets = [0, 0, 0], strides = [1]} : vector<4xf16> into vector<1x1x4xf16>
%24 = vector.load %3[%workgroup_id_x, %13] : memref<1280x2816xf16, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf16>
%25 = vector.insert_strided_slice %24, %cst_2 {offsets = [0, 0, 0], strides = [1]} : vector<4xf16> into vector<1x1x4xf16>
%26 = arith.extf %23 : vector<1x1x4xf16> to vector<1x1x4xf32>
%27 = arith.extf %25 : vector<1x1x4xf16> to vector<1x1x4xf32>
%28 = arith.mulf %26, %27 : vector<1x1x4xf32>
%29 = arith.addf %28, %cst_3 : vector<1x1x4xf32>
%30 = vector.extract %29[0, 0, 0] : f32 from vector<1x1x4xf32>
%31 = vector.extract %29[0, 0, 1] : f32 from vector<1x1x4xf32>
%32 = vector.extract %29[0, 0, 2] : f32 from vector<1x1x4xf32>
%33 = vector.extract %29[0, 0, 3] : f32 from vector<1x1x4xf32>
%34 = arith.addf %30, %cst : f32
%35 = arith.addf %31, %34 : f32
%36 = arith.addf %32, %35 : f32
%37 = arith.addf %33, %36 : f32
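// Subgroup (wave) reduction: a 6-step xor-shuffle butterfly over the 64-lane
// wave (offsets 1, 2, 4, 8, 16, 32); afterwards every lane holds the full
// wave sum in %43.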
%shuffleResult, %valid = gpu.shuffle xor %37, %c1_i32, %c64_i32 : f32
%38 = arith.addf %37, %shuffleResult : f32
%shuffleResult_4, %valid_5 = gpu.shuffle xor %38, %c2_i32, %c64_i32 : f32
%39 = arith.addf %38, %shuffleResult_4 : f32
%shuffleResult_6, %valid_7 = gpu.shuffle xor %39, %c4_i32, %c64_i32 : f32
%40 = arith.addf %39, %shuffleResult_6 : f32
%shuffleResult_8, %valid_9 = gpu.shuffle xor %40, %c8_i32, %c64_i32 : f32
%41 = arith.addf %40, %shuffleResult_8 : f32
%shuffleResult_10, %valid_11 = gpu.shuffle xor %41, %c16_i32, %c64_i32 : f32
%42 = arith.addf %41, %shuffleResult_10 : f32
%shuffleResult_12, %valid_13 = gpu.shuffle xor %42, %c32_i32, %c64_i32 : f32
%43 = arith.addf %42, %shuffleResult_12 : f32
%44 = vector.insert %43, %cst_1 [0] : f32 into vector<1xf32>
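// Cross-wave stage: lane 0 of each of the 11 waves stores its wave sum into
// workgroup memory. The buffer is 13 elements even though only indices 0..10
// are written; the extra slots look like padding introduced by the lowering.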
%alloc = memref.alloc() : memref<13xf32, #gpu.address_space<workgroup>>
gpu.barrier
%45 = arith.cmpi eq, %9, %c0_i32 : i32
cf.cond_br %45, ^bb4, ^bb5
^bb4: // pred: ^bb3
vector.store %44, %alloc[%8] : memref<13xf32, #gpu.address_space<workgroup>>, vector<1xf32>
cf.br ^bb5
^bb5: // 2 preds: ^bb3, ^bb4
gpu.barrier
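// Every thread reloads all 11 wave partials as a single vector<11xf32> and
// reduces them with another serial chain of scalar adds (%58..%68); only
// thread 0 stores the final f32 to the 2x1280 output.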
%46 = vector.load %alloc[%c0] : memref<13xf32, #gpu.address_space<workgroup>>, vector<11xf32>
%47 = vector.extract %46[0] : f32 from vector<11xf32>
%48 = vector.extract %46[1] : f32 from vector<11xf32>
%49 = vector.extract %46[2] : f32 from vector<11xf32>
%50 = vector.extract %46[3] : f32 from vector<11xf32>
%51 = vector.extract %46[4] : f32 from vector<11xf32>
%52 = vector.extract %46[5] : f32 from vector<11xf32>
%53 = vector.extract %46[6] : f32 from vector<11xf32>
%54 = vector.extract %46[7] : f32 from vector<11xf32>
%55 = vector.extract %46[8] : f32 from vector<11xf32>
%56 = vector.extract %46[9] : f32 from vector<11xf32>
%57 = vector.extract %46[10] : f32 from vector<11xf32>
%58 = arith.addf %47, %cst : f32
%59 = arith.addf %48, %58 : f32
%60 = arith.addf %49, %59 : f32
%61 = arith.addf %50, %60 : f32
%62 = arith.addf %51, %61 : f32
%63 = arith.addf %52, %62 : f32
%64 = arith.addf %53, %63 : f32
%65 = arith.addf %54, %64 : f32
%66 = arith.addf %55, %65 : f32
%67 = arith.addf %56, %66 : f32
%68 = arith.addf %57, %67 : f32
%69 = arith.addf %68, %cst : f32
%70 = vector.splat %69 : vector<f32>
%71 = arith.cmpi eq, %6, %c0_i32 : i32
cf.cond_br %71, ^bb6, ^bb7
^bb6: // pred: ^bb5
vector.store %70, %5[%workgroup_id_y, %workgroup_id_x] : memref<2x1280xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<f32>
cf.br ^bb7
^bb7: // 2 preds: ^bb5, ^bb6
return
}
}
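// Second variant of the same dispatch. The main differences from the dump
// above: the contiguous operand is loaded before the gather loop, the
// per-thread 4-element product is folded with vector.reduction, and the 11
// wave partials are combined with a second in-register shuffle tree instead
// of a serial sum over a shared-memory vector load.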
module {
func.func @faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32() {
%c11_i32 = arith.constant 11 : i32
%c10_i32 = arith.constant 10 : i32
%cst = arith.constant dense<0.000000e+00> : vector<4xf16>
%c0_i32 = arith.constant 0 : i32
%c32_i32 = arith.constant 32 : i32
%c16_i32 = arith.constant 16 : i32
%c8_i32 = arith.constant 8 : i32
%c4_i32 = arith.constant 4 : i32
%c2_i32 = arith.constant 2 : i32
%c64_i32 = arith.constant 64 : i32
%c1_i32 = arith.constant 1 : i32
%cst_0 = arith.constant dense<0.000000e+00> : vector<4xf32>
%c0 = arith.constant 0 : index
%cst_1 = arith.constant 0.000000e+00 : f32
%thread_id_x = gpu.thread_id x upper_bound 704
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2816x2xf16, #gpu.address_space<global>>
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<2816x2xf16, #gpu.address_space<global>> to memref<2816x2xf16, #amdgpu.address_space<fat_raw_buffer>>
memref.assume_alignment %1, 64 : memref<2816x2xf16, #amdgpu.address_space<fat_raw_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1280x2816xf16, #gpu.address_space<global>>
%3 = amdgpu.fat_raw_buffer_cast %2 resetOffset : memref<1280x2816xf16, #gpu.address_space<global>> to memref<1280x2816xf16, #amdgpu.address_space<fat_raw_buffer>>
memref.assume_alignment %3, 64 : memref<1280x2816xf16, #amdgpu.address_space<fat_raw_buffer>>
%4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x1280xf32, #gpu.address_space<global>>
%5 = amdgpu.fat_raw_buffer_cast %4 resetOffset : memref<2x1280xf32, #gpu.address_space<global>> to memref<2x1280xf32, #amdgpu.address_space<fat_raw_buffer>>
memref.assume_alignment %5, 64 : memref<2x1280xf32, #amdgpu.address_space<fat_raw_buffer>>
%workgroup_id_y = hal.interface.workgroup.id[1] upper_bound 2 : index
%workgroup_id_x = hal.interface.workgroup.id[0] upper_bound 1280 : index
%6 = arith.index_castui %thread_id_x : index to i32
%7 = arith.muli %6, %c4_i32 overflow<nsw> : i32
%8 = arith.index_castui %7 : i32 to index
%9 = vector.load %3[%workgroup_id_x, %8] : memref<1280x2816xf16, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf16>
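// The row of the 1280x2816 operand is contiguous along K, so this thread's
// 4-element slice is loaded up front as a single vector<4xf16> at K = tid * 4
// (algebraically the same offset the first dump computes as
// wave * 256 + lane * 4); the strided 2816x2 operand is still gathered
// scalar-by-scalar in the loop below.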
cf.br ^bb1(%c0_i32, %cst : i32, vector<4xf16>)
^bb1(%10: i32, %11: vector<4xf16>): // 2 preds: ^bb0, ^bb2
%12 = arith.cmpi slt, %10, %c4_i32 : i32
cf.cond_br %12, ^bb2, ^bb3
^bb2: // pred: ^bb1
%13 = arith.index_castui %10 : i32 to index
%14 = arith.addi %7, %10 : i32
%15 = arith.index_castui %14 : i32 to index
%16 = memref.load %1[%15, %workgroup_id_y] : memref<2816x2xf16, #amdgpu.address_space<fat_raw_buffer>>
%17 = vector.insertelement %16, %11[%13 : index] : vector<4xf16>
%18 = arith.addi %10, %c1_i32 : i32
cf.br ^bb1(%18, %17 : i32, vector<4xf16>)
^bb3: // pred: ^bb1
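// Same widen-and-multiply step as above, but the four products are summed
// with a single vector.reduction instead of scalar extracts, and the wave
// total is then combined with the same 6-step xor-shuffle butterfly.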
%19 = arith.extf %9 : vector<4xf16> to vector<4xf32>
%20 = arith.extf %11 : vector<4xf16> to vector<4xf32>
%21 = arith.mulf %20, %19 : vector<4xf32>
%22 = arith.addf %21, %cst_0 : vector<4xf32>
%23 = vector.reduction <add>, %22 : vector<4xf32> into f32
%shuffleResult, %valid = gpu.shuffle xor %23, %c1_i32, %c64_i32 : f32
%24 = arith.addf %23, %shuffleResult : f32
%shuffleResult_2, %valid_3 = gpu.shuffle xor %24, %c2_i32, %c64_i32 : f32
%25 = arith.addf %24, %shuffleResult_2 : f32
%shuffleResult_4, %valid_5 = gpu.shuffle xor %25, %c4_i32, %c64_i32 : f32
%26 = arith.addf %25, %shuffleResult_4 : f32
%shuffleResult_6, %valid_7 = gpu.shuffle xor %26, %c8_i32, %c64_i32 : f32
%27 = arith.addf %26, %shuffleResult_6 : f32
%shuffleResult_8, %valid_9 = gpu.shuffle xor %27, %c16_i32, %c64_i32 : f32
%28 = arith.addf %27, %shuffleResult_8 : f32
%shuffleResult_10, %valid_11 = gpu.shuffle xor %28, %c32_i32, %c64_i32 : f32
%29 = arith.addf %28, %shuffleResult_10 : f32
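// Cross-wave stage: the shared buffer is now exactly 11 elements (one slot
// per wave); lane 0 of each wave stores its wave sum before the barrier.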
%alloc = memref.alloc() : memref<11xf32, #gpu.address_space<workgroup>>
%30 = arith.divui %6, %c64_i32 : i32
%31 = arith.index_castui %30 : i32 to index
%32 = arith.remui %6, %c64_i32 : i32
%33 = arith.cmpi eq, %32, %c0_i32 : i32
cf.cond_br %33, ^bb4, ^bb5
^bb4: // pred: ^bb3
memref.store %29, %alloc[%31] : memref<11xf32, #gpu.address_space<workgroup>>
cf.br ^bb5
^bb5: // 2 preds: ^bb3, ^bb4
gpu.barrier
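// Second reduction stage, done in registers: each lane reads partial
// min(lane, 10) and lanes >= 11 substitute 0.0, so a 4-step xor-shuffle
// tree (offsets 1, 2, 4, 8, i.e. a 16-lane butterfly) sums all 11 partials;
// gpu.shuffle idx then broadcasts lane 0's total to the whole wave, and
// thread 0 writes the final f32.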
%34 = arith.minui %32, %c10_i32 : i32
%35 = arith.index_castui %34 : i32 to index
%36 = memref.load %alloc[%35] : memref<11xf32, #gpu.address_space<workgroup>>
%37 = arith.cmpi sge, %32, %c11_i32 : i32
%38 = arith.select %37, %cst_1, %36 : f32
%shuffleResult_12, %valid_13 = gpu.shuffle xor %38, %c1_i32, %c64_i32 : f32
%39 = arith.addf %38, %shuffleResult_12 : f32
%shuffleResult_14, %valid_15 = gpu.shuffle xor %39, %c2_i32, %c64_i32 : f32
%40 = arith.addf %39, %shuffleResult_14 : f32
%shuffleResult_16, %valid_17 = gpu.shuffle xor %40, %c4_i32, %c64_i32 : f32
%41 = arith.addf %40, %shuffleResult_16 : f32
%shuffleResult_18, %valid_19 = gpu.shuffle xor %41, %c8_i32, %c64_i32 : f32
%42 = arith.addf %41, %shuffleResult_18 : f32
%shuffleResult_20, %valid_21 = gpu.shuffle idx %42, %c0_i32, %c64_i32 : f32
%43 = arith.addf %shuffleResult_20, %cst_1 : f32
%44 = vector.splat %43 : vector<1xf32>
%45 = arith.cmpi eq, %6, %c0_i32 : i32
cf.cond_br %45, ^bb6, ^bb7
^bb6: // pred: ^bb5
vector.store %44, %5[%workgroup_id_y, %workgroup_id_x] : memref<2x1280xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<1xf32>
cf.br ^bb7
^bb7: // 2 preds: ^bb5, ^bb6
return
}
}