Created
April 24, 2025 19:35
-
-
Save pashu123/1f3ec72853af4a88bb1324212c432534 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -----// IR Dump After DropCompilerHintsPass (iree-util-drop-compiler-hints) //----- //
// Module holding a single GPU dispatch function.
//
// Per workgroup, the function writes one f32 value to
// output[workgroup_id_y, workgroup_id_x] (binding 2, memref<2x1280xf32>).
// That value is the sum over k of
//   extf(binding0[k, workgroup_id_y]) * extf(binding1[workgroup_id_x, k])
// where every thread handles the 4 consecutive k values starting at
// (tid / 64) * 256 + (tid % 64) * 4, which equals tid * 4.
//
// Reduction scheme visible in this dump:
//   1. per-thread: 4 products summed sequentially into one f32;
//   2. per 64-wide shuffle group: xor shuffles with offsets 1..32;
//   3. per workgroup: threads with tid % 64 == 0 write their partial to
//      workgroup memory, then every thread reads 11 partials and sums them;
//   4. the thread with tid == 0 stores the result.
module {
  func.func @faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32() {
    // Scalar and vector constants; all floating-point constants are zero.
    %cst = arith.constant 0.000000e+00 : f32
    %c0_i32 = arith.constant 0 : i32
    %c256_i32 = arith.constant 256 : i32
    %c32_i32 = arith.constant 32 : i32
    %c16_i32 = arith.constant 16 : i32
    %c8_i32 = arith.constant 8 : i32
    %c4_i32 = arith.constant 4 : i32
    %c2_i32 = arith.constant 2 : i32
    %c64_i32 = arith.constant 64 : i32
    %c1_i32 = arith.constant 1 : i32
    %cst_0 = arith.constant dense<0.000000e+00> : vector<4xf16>
    %cst_1 = arith.constant dense<0.000000e+00> : vector<1xf32>
    %cst_2 = arith.constant dense<0.000000e+00> : vector<1x1x4xf16>
    %c0 = arith.constant 0 : index
    %cst_3 = arith.constant dense<0.000000e+00> : vector<1x1x4xf32>
    // Thread id along x, annotated with an upper bound of 704 (= 11 * 64).
    %thread_id_x = gpu.thread_id x upper_bound 704
    // Binding 0: read-only 2816x2 f16 buffer, cast to a fat raw buffer.
    %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2816x2xf16, #gpu.address_space<global>>
    %1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<2816x2xf16, #gpu.address_space<global>> to memref<2816x2xf16, #amdgpu.address_space<fat_raw_buffer>>
    memref.assume_alignment %1, 64 : memref<2816x2xf16, #amdgpu.address_space<fat_raw_buffer>>
    // Binding 1: read-only 1280x2816 f16 buffer, cast to a fat raw buffer.
    %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1280x2816xf16, #gpu.address_space<global>>
    %3 = amdgpu.fat_raw_buffer_cast %2 resetOffset : memref<1280x2816xf16, #gpu.address_space<global>> to memref<1280x2816xf16, #amdgpu.address_space<fat_raw_buffer>>
    memref.assume_alignment %3, 64 : memref<1280x2816xf16, #amdgpu.address_space<fat_raw_buffer>>
    // Binding 2: 2x1280 f32 output buffer, cast to a fat raw buffer.
    %4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x1280xf32, #gpu.address_space<global>>
    %5 = amdgpu.fat_raw_buffer_cast %4 resetOffset : memref<2x1280xf32, #gpu.address_space<global>> to memref<2x1280xf32, #amdgpu.address_space<fat_raw_buffer>>
    memref.assume_alignment %5, 64 : memref<2x1280xf32, #amdgpu.address_space<fat_raw_buffer>>
    // Workgroup ids: x selects the row of binding 1 and the output column,
    // y selects the column of binding 0 and the output row.
    %workgroup_id_x = hal.interface.workgroup.id[0] upper_bound 1280 : index
    %workgroup_id_y = hal.interface.workgroup.id[1] upper_bound 2 : index
    // %7 = tid / 64 (slot index used for the workgroup-memory store below),
    // %9 = tid % 64 (position inside the 64-wide group),
    // %12 = %7 * 256 + %9 * 4, the first k index handled by this thread.
    %6 = arith.index_castui %thread_id_x : index to i32
    %7 = arith.divui %6, %c64_i32 : i32
    %8 = arith.index_castui %7 : i32 to index
    %9 = arith.remui %6, %c64_i32 : i32
    %10 = arith.muli %7, %c256_i32 : i32
    %11 = arith.muli %9, %c4_i32 : i32
    %12 = arith.addi %10, %11 : i32
    %13 = arith.index_castui %12 : i32 to index
    // Loop header: %14 counts 0..3, %15 carries the vector being filled.
    cf.br ^bb1(%c0_i32, %cst_0 : i32, vector<4xf16>)
  ^bb1(%14: i32, %15: vector<4xf16>):  // 2 preds: ^bb0, ^bb2
    %16 = arith.cmpi slt, %14, %c4_i32 : i32
    cf.cond_br %16, ^bb2, ^bb3
  ^bb2:  // pred: ^bb1
    // Loop body: scalar load of binding0[%12 + i, workgroup_id_y] inserted
    // into lane i of the carried vector.
    %17 = arith.index_castui %14 : i32 to index
    %18 = arith.addi %12, %14 : i32
    %19 = arith.index_castui %18 : i32 to index
    %20 = memref.load %1[%19, %workgroup_id_y] : memref<2816x2xf16, #amdgpu.address_space<fat_raw_buffer>>
    %21 = vector.insertelement %20, %15[%17 : index] : vector<4xf16>
    %22 = arith.addi %14, %c1_i32 : i32
    cf.br ^bb1(%22, %21 : i32, vector<4xf16>)
  ^bb3:  // pred: ^bb1
    // Widen both 4xf16 operands to 1x1x4, extend to f32 and multiply.
    %23 = vector.insert_strided_slice %15, %cst_2 {offsets = [0, 0, 0], strides = [1]} : vector<4xf16> into vector<1x1x4xf16>
    // Vector load of 4 contiguous f16 from binding1[workgroup_id_x, %12 ..].
    %24 = vector.load %3[%workgroup_id_x, %13] : memref<1280x2816xf16, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf16>
    %25 = vector.insert_strided_slice %24, %cst_2 {offsets = [0, 0, 0], strides = [1]} : vector<4xf16> into vector<1x1x4xf16>
    %26 = arith.extf %23 : vector<1x1x4xf16> to vector<1x1x4xf32>
    %27 = arith.extf %25 : vector<1x1x4xf16> to vector<1x1x4xf32>
    %28 = arith.mulf %26, %27 : vector<1x1x4xf32>
    // Add to the all-zero accumulator %cst_3.
    %29 = arith.addf %28, %cst_3 : vector<1x1x4xf32>
    // Per-thread reduction: sum the 4 products sequentially, starting from 0.0.
    %30 = vector.extract %29[0, 0, 0] : f32 from vector<1x1x4xf32>
    %31 = vector.extract %29[0, 0, 1] : f32 from vector<1x1x4xf32>
    %32 = vector.extract %29[0, 0, 2] : f32 from vector<1x1x4xf32>
    %33 = vector.extract %29[0, 0, 3] : f32 from vector<1x1x4xf32>
    %34 = arith.addf %30, %cst : f32
    %35 = arith.addf %31, %34 : f32
    %36 = arith.addf %32, %35 : f32
    %37 = arith.addf %33, %36 : f32
    // Cross-lane reduction: xor shuffles with offsets 1, 2, 4, 8, 16, 32 and
    // width 64; each step adds the value received from the partner lane.
    %shuffleResult, %valid = gpu.shuffle xor %37, %c1_i32, %c64_i32 : f32
    %38 = arith.addf %37, %shuffleResult : f32
    %shuffleResult_4, %valid_5 = gpu.shuffle xor %38, %c2_i32, %c64_i32 : f32
    %39 = arith.addf %38, %shuffleResult_4 : f32
    %shuffleResult_6, %valid_7 = gpu.shuffle xor %39, %c4_i32, %c64_i32 : f32
    %40 = arith.addf %39, %shuffleResult_6 : f32
    %shuffleResult_8, %valid_9 = gpu.shuffle xor %40, %c8_i32, %c64_i32 : f32
    %41 = arith.addf %40, %shuffleResult_8 : f32
    %shuffleResult_10, %valid_11 = gpu.shuffle xor %41, %c16_i32, %c64_i32 : f32
    %42 = arith.addf %41, %shuffleResult_10 : f32
    %shuffleResult_12, %valid_13 = gpu.shuffle xor %42, %c32_i32, %c64_i32 : f32
    %43 = arith.addf %42, %shuffleResult_12 : f32
    %44 = vector.insert %43, %cst_1 [0] : f32 into vector<1xf32>
    // Workgroup-memory scratch buffer. It has 13 slots, but only indices
    // tid / 64 are written and only the first 11 are read back below.
    %alloc = memref.alloc() : memref<13xf32, #gpu.address_space<workgroup>>
    gpu.barrier
    // Only threads with tid % 64 == 0 publish their partial sum.
    %45 = arith.cmpi eq, %9, %c0_i32 : i32
    cf.cond_br %45, ^bb4, ^bb5
  ^bb4:  // pred: ^bb3
    vector.store %44, %alloc[%8] : memref<13xf32, #gpu.address_space<workgroup>>, vector<1xf32>
    cf.br ^bb5
  ^bb5:  // 2 preds: ^bb3, ^bb4
    // Barrier between the stores above and the loads below.
    gpu.barrier
    // Every thread reads the 11 partials and sums them sequentially.
    %46 = vector.load %alloc[%c0] : memref<13xf32, #gpu.address_space<workgroup>>, vector<11xf32>
    %47 = vector.extract %46[0] : f32 from vector<11xf32>
    %48 = vector.extract %46[1] : f32 from vector<11xf32>
    %49 = vector.extract %46[2] : f32 from vector<11xf32>
    %50 = vector.extract %46[3] : f32 from vector<11xf32>
    %51 = vector.extract %46[4] : f32 from vector<11xf32>
    %52 = vector.extract %46[5] : f32 from vector<11xf32>
    %53 = vector.extract %46[6] : f32 from vector<11xf32>
    %54 = vector.extract %46[7] : f32 from vector<11xf32>
    %55 = vector.extract %46[8] : f32 from vector<11xf32>
    %56 = vector.extract %46[9] : f32 from vector<11xf32>
    %57 = vector.extract %46[10] : f32 from vector<11xf32>
    %58 = arith.addf %47, %cst : f32
    %59 = arith.addf %48, %58 : f32
    %60 = arith.addf %49, %59 : f32
    %61 = arith.addf %50, %60 : f32
    %62 = arith.addf %51, %61 : f32
    %63 = arith.addf %52, %62 : f32
    %64 = arith.addf %53, %63 : f32
    %65 = arith.addf %54, %64 : f32
    %66 = arith.addf %55, %65 : f32
    %67 = arith.addf %56, %66 : f32
    %68 = arith.addf %57, %67 : f32
    // Final add of the zero constant %cst.
    %69 = arith.addf %68, %cst : f32
    %70 = vector.splat %69 : vector<f32>
    // Only the thread with tid == 0 writes the result.
    %71 = arith.cmpi eq, %6, %c0_i32 : i32
    cf.cond_br %71, ^bb6, ^bb7
  ^bb6:  // pred: ^bb5
    vector.store %70, %5[%workgroup_id_y, %workgroup_id_x] : memref<2x1280xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<f32>
    cf.br ^bb7
  ^bb7:  // 2 preds: ^bb5, ^bb6
    return
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Second dump of the dispatch function.
//
// Per workgroup, the function writes one f32 value to
// output[workgroup_id_y, workgroup_id_x] (binding 2, memref<2x1280xf32>).
// That value is the sum over k of
//   extf(binding0[k, workgroup_id_y]) * extf(binding1[workgroup_id_x, k])
// where every thread handles the 4 consecutive k values starting at tid * 4.
//
// Reduction scheme visible in this dump:
//   1. per-thread: vector.reduction <add> over the 4 products;
//   2. per 64-wide shuffle group: xor shuffles with offsets 1..32;
//   3. per workgroup: threads with tid % 64 == 0 write their partial to an
//      11-slot workgroup-memory buffer; after a barrier each thread reads the
//      slot min(tid % 64, 10), replaces it with 0.0 when tid % 64 >= 11, and
//      the values are combined with xor shuffles (offsets 1..8) followed by
//      an idx shuffle from lane 0;
//   4. the thread with tid == 0 stores the result.
//
// NOTE(review): the opening `module {` line was absent from this dump while
// its closing brace was present; it is restored here so the braces balance.
module {
  func.func @faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32() {
    // Scalar and vector constants; all floating-point constants are zero.
    %c11_i32 = arith.constant 11 : i32
    %c10_i32 = arith.constant 10 : i32
    %cst = arith.constant dense<0.000000e+00> : vector<4xf16>
    %c0_i32 = arith.constant 0 : i32
    %c32_i32 = arith.constant 32 : i32
    %c16_i32 = arith.constant 16 : i32
    %c8_i32 = arith.constant 8 : i32
    %c4_i32 = arith.constant 4 : i32
    %c2_i32 = arith.constant 2 : i32
    %c64_i32 = arith.constant 64 : i32
    %c1_i32 = arith.constant 1 : i32
    %cst_0 = arith.constant dense<0.000000e+00> : vector<4xf32>
    %c0 = arith.constant 0 : index
    %cst_1 = arith.constant 0.000000e+00 : f32
    // Thread id along x, annotated with an upper bound of 704 (= 11 * 64).
    %thread_id_x = gpu.thread_id x upper_bound 704
    // Binding 0: read-only 2816x2 f16 buffer, cast to a fat raw buffer.
    %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2816x2xf16, #gpu.address_space<global>>
    %1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<2816x2xf16, #gpu.address_space<global>> to memref<2816x2xf16, #amdgpu.address_space<fat_raw_buffer>>
    memref.assume_alignment %1, 64 : memref<2816x2xf16, #amdgpu.address_space<fat_raw_buffer>>
    // Binding 1: read-only 1280x2816 f16 buffer, cast to a fat raw buffer.
    %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1280x2816xf16, #gpu.address_space<global>>
    %3 = amdgpu.fat_raw_buffer_cast %2 resetOffset : memref<1280x2816xf16, #gpu.address_space<global>> to memref<1280x2816xf16, #amdgpu.address_space<fat_raw_buffer>>
    memref.assume_alignment %3, 64 : memref<1280x2816xf16, #amdgpu.address_space<fat_raw_buffer>>
    // Binding 2: 2x1280 f32 output buffer, cast to a fat raw buffer.
    %4 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x1280xf32, #gpu.address_space<global>>
    %5 = amdgpu.fat_raw_buffer_cast %4 resetOffset : memref<2x1280xf32, #gpu.address_space<global>> to memref<2x1280xf32, #amdgpu.address_space<fat_raw_buffer>>
    memref.assume_alignment %5, 64 : memref<2x1280xf32, #amdgpu.address_space<fat_raw_buffer>>
    // Workgroup ids: x selects the row of binding 1 and the output column,
    // y selects the column of binding 0 and the output row.
    %workgroup_id_y = hal.interface.workgroup.id[1] upper_bound 2 : index
    %workgroup_id_x = hal.interface.workgroup.id[0] upper_bound 1280 : index
    // %7 = tid * 4, the first k index handled by this thread.
    %6 = arith.index_castui %thread_id_x : index to i32
    %7 = arith.muli %6, %c4_i32 overflow<nsw> : i32
    %8 = arith.index_castui %7 : i32 to index
    // Vector load of 4 contiguous f16 from binding1[workgroup_id_x, %7 ..].
    %9 = vector.load %3[%workgroup_id_x, %8] : memref<1280x2816xf16, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf16>
    // Loop header: %10 counts 0..3, %11 carries the vector being filled.
    cf.br ^bb1(%c0_i32, %cst : i32, vector<4xf16>)
  ^bb1(%10: i32, %11: vector<4xf16>):  // 2 preds: ^bb0, ^bb2
    %12 = arith.cmpi slt, %10, %c4_i32 : i32
    cf.cond_br %12, ^bb2, ^bb3
  ^bb2:  // pred: ^bb1
    // Loop body: scalar load of binding0[%7 + i, workgroup_id_y] inserted
    // into lane i of the carried vector.
    %13 = arith.index_castui %10 : i32 to index
    %14 = arith.addi %7, %10 : i32
    %15 = arith.index_castui %14 : i32 to index
    %16 = memref.load %1[%15, %workgroup_id_y] : memref<2816x2xf16, #amdgpu.address_space<fat_raw_buffer>>
    %17 = vector.insertelement %16, %11[%13 : index] : vector<4xf16>
    %18 = arith.addi %10, %c1_i32 : i32
    cf.br ^bb1(%18, %17 : i32, vector<4xf16>)
  ^bb3:  // pred: ^bb1
    // Extend both operands to f32, multiply, add the zero accumulator and
    // reduce the 4 lanes to a single f32.
    %19 = arith.extf %9 : vector<4xf16> to vector<4xf32>
    %20 = arith.extf %11 : vector<4xf16> to vector<4xf32>
    %21 = arith.mulf %20, %19 : vector<4xf32>
    %22 = arith.addf %21, %cst_0 : vector<4xf32>
    %23 = vector.reduction <add>, %22 : vector<4xf32> into f32
    // Cross-lane reduction: xor shuffles with offsets 1, 2, 4, 8, 16, 32 and
    // width 64; each step adds the value received from the partner lane.
    %shuffleResult, %valid = gpu.shuffle xor %23, %c1_i32, %c64_i32 : f32
    %24 = arith.addf %23, %shuffleResult : f32
    %shuffleResult_2, %valid_3 = gpu.shuffle xor %24, %c2_i32, %c64_i32 : f32
    %25 = arith.addf %24, %shuffleResult_2 : f32
    %shuffleResult_4, %valid_5 = gpu.shuffle xor %25, %c4_i32, %c64_i32 : f32
    %26 = arith.addf %25, %shuffleResult_4 : f32
    %shuffleResult_6, %valid_7 = gpu.shuffle xor %26, %c8_i32, %c64_i32 : f32
    %27 = arith.addf %26, %shuffleResult_6 : f32
    %shuffleResult_8, %valid_9 = gpu.shuffle xor %27, %c16_i32, %c64_i32 : f32
    %28 = arith.addf %27, %shuffleResult_8 : f32
    %shuffleResult_10, %valid_11 = gpu.shuffle xor %28, %c32_i32, %c64_i32 : f32
    %29 = arith.addf %28, %shuffleResult_10 : f32
    // Workgroup-memory scratch buffer with 11 slots, indexed by tid / 64.
    %alloc = memref.alloc() : memref<11xf32, #gpu.address_space<workgroup>>
    %30 = arith.divui %6, %c64_i32 : i32
    %31 = arith.index_castui %30 : i32 to index
    %32 = arith.remui %6, %c64_i32 : i32
    // Only threads with tid % 64 == 0 publish their partial sum.
    %33 = arith.cmpi eq, %32, %c0_i32 : i32
    cf.cond_br %33, ^bb4, ^bb5
  ^bb4:  // pred: ^bb3
    memref.store %29, %alloc[%31] : memref<11xf32, #gpu.address_space<workgroup>>
    cf.br ^bb5
  ^bb5:  // 2 preds: ^bb3, ^bb4
    // Barrier between the stores above and the loads below.
    gpu.barrier
    // Each thread reads slot min(tid % 64, 10); the clamp keeps the index
    // inside the 11-slot buffer. Threads with tid % 64 >= 11 then contribute
    // 0.0 instead of the loaded value.
    %34 = arith.minui %32, %c10_i32 : i32
    %35 = arith.index_castui %34 : i32 to index
    %36 = memref.load %alloc[%35] : memref<11xf32, #gpu.address_space<workgroup>>
    %37 = arith.cmpi sge, %32, %c11_i32 : i32
    %38 = arith.select %37, %cst_1, %36 : f32
    // Second cross-lane reduction: xor shuffles with offsets 1, 2, 4, 8.
    %shuffleResult_12, %valid_13 = gpu.shuffle xor %38, %c1_i32, %c64_i32 : f32
    %39 = arith.addf %38, %shuffleResult_12 : f32
    %shuffleResult_14, %valid_15 = gpu.shuffle xor %39, %c2_i32, %c64_i32 : f32
    %40 = arith.addf %39, %shuffleResult_14 : f32
    %shuffleResult_16, %valid_17 = gpu.shuffle xor %40, %c4_i32, %c64_i32 : f32
    %41 = arith.addf %40, %shuffleResult_16 : f32
    %shuffleResult_18, %valid_19 = gpu.shuffle xor %41, %c8_i32, %c64_i32 : f32
    %42 = arith.addf %41, %shuffleResult_18 : f32
    // Take the value held by lane 0, then add the zero constant %cst_1.
    %shuffleResult_20, %valid_21 = gpu.shuffle idx %42, %c0_i32, %c64_i32 : f32
    %43 = arith.addf %shuffleResult_20, %cst_1 : f32
    %44 = vector.splat %43 : vector<1xf32>
    // Only the thread with tid == 0 writes the result.
    %45 = arith.cmpi eq, %6, %c0_i32 : i32
    cf.cond_br %45, ^bb6, ^bb7
  ^bb6:  // pred: ^bb5
    vector.store %44, %5[%workgroup_id_y, %workgroup_id_x] : memref<2x1280xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<1xf32>
    cf.br ^bb7
  ^bb7:  // 2 preds: ^bb5, ^bb6
    return
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment