// -----// IR Dump After CSE (cse) //----- //
func.func @__builtin_fill_i64(%arg0: !util.buffer, %arg1: !util.buffer, %arg2: !util.list<!util.buffer>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32) attributes {translation_info = #iree_codegen.translation_info<pipeline = VMVXDefault>} {
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%c1 = arith.constant 1 : index
%buffer_size = util.buffer.size %arg1 : !util.buffer
%0 = util.buffer.load %arg1[%c0 for %c4] : !util.buffer{%buffer_size} -> i32
bjacob / README.md
Created March 4, 2025 20:04
Each hardware-specific microscaling format is a different quantization scheme

This note is about what we can expect once workloads get optimized for microscaling.

Microscaling is about new hardware having new matrix-multiplication instructions on small-bit-width operands, plus separate scale factors. For instance, there is going to be an FP8 matrix multiplication instruction, accumulating in FP32, with additional "scale" FP32 operands applied as multipliers on the FP8 inputs just before multiply-accumulating them. There are also going to be new microscaling instructions for other small-bit-width floating-point and integer types.
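
As a rough illustration, here is a minimal C sketch of that instruction-level semantics. The FP8 inputs are modeled as already-decoded floats, and the function name, per-operand scales, and reduction length K are illustrative assumptions, not any particular ISA.

/* Minimal sketch, not any specific instruction: fp8 inputs are modeled as
   already-decoded floats, with one scale per operand and fp32 accumulation. */
void scaled_dot_accumulate(int K, const float *a_fp8, float a_scale,
                           const float *b_fp8, float b_scale, float *acc) {
  for (int k = 0; k < K; ++k) {
    /* Scales multiply the low-precision inputs just before the
       multiply-accumulate; the accumulator stays in fp32. */
    *acc += (a_scale * a_fp8[k]) * (b_scale * b_fp8[k]);
  }
}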

Different hardware has always had different tile sizes. Different hardware has also supported different element types, but that mostly meant vendors catching up to the element types that others already supported. Once the same element type was supported, the differences in tile sizes were layout differences.

 tools/iree-opt --iree-hal-conversion ~/b.mlir -debug 2>~/log.mlir

Why am I getting this error when building with this diff?

➜  iree-build ninja
[0/2] Re-checking globbed directories...
[181/317] Generating check_llvm-cpu_local-task_generic_success.mlir_module.vmfb from success.mlir
FAILED: runtime/src/iree/modules/check/test/check_llvm-cpu_local-task_generic_success.mlir_module.vmfb /home/benjacob/iree-build/runtime/src/iree/modules/check/test/check_llvm-cpu_local-task_generic_success.mlir_module.vmfb 
cd /home/benjacob/iree-build/runtime/src/iree/modules/check/test && /home/benjacob/iree-build/tools/iree-compile --output-format=vm-bytecode --mlir-print-op-on-diagnostic=false --iree-hal-target-backends=llvm-cpu --iree-llvmcpu-target-cpu=generic /home/benjacob/iree/runtime/src/iree/modules/check/test/success.mlir -o check_llvm-cpu_local-task_generic_success.mlir_module.vmfb --iree-hal-executable-object-search-path=\"/home/benjacob/iree-build\" --iree-llvmcpu-embedded-linker-path=\"/home/benjacob/iree-build/llvm-project/bin/lld\" --iree-llvmcpu-wasm-linker-path=\"/home/b
diff --git a/home/benjacob/interm-good/compiled_punet_compiled_punet_linked_rocm_hsaco_fb.linked.ll b/home/benjacob/interm-bad/compiled_punet_compiled_punet_linked_rocm_hsaco_fb.linked.ll
index 3aa3582..d2834a5 100644
--- a/home/benjacob/interm-good/compiled_punet_compiled_punet_linked_rocm_hsaco_fb.linked.ll
+++ b/home/benjacob/interm-bad/compiled_punet_compiled_punet_linked_rocm_hsaco_fb.linked.ll
@@ -43173,65 +43173,53 @@ define amdgpu_kernel void @"main$async_dispatch_57_elementwise_2x4096x2560_f16xf
%49 = load <8 x half>, ptr addrspace(1) %48, align 2
%50 = load <1 x float>, ptr addrspace(1) %2, align 4
%51 = fdiv <8 x half> %47, %19
- %52 = fcmp olt <8 x half> %51, zeroinitializer
- %53 = fneg <8 x half> %51
// -----// IR Dump After CSE (cse) //----- //
func.func @_check_reordering_dispatch_0_generic_384_f32() {
%0 = ub.poison : vector<f32>
%cst = arith.constant dense<-0.000000e+00> : vector<1xf32>
%c4 = arith.constant 4 : index
%c384 = arith.constant 384 : index
%c0 = arith.constant 0 : index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<384xf32>
memref.assume_alignment %1, 64 : memref<384xf32>
; Function Attrs: convergent mustprogress nofree norecurse nounwind willreturn memory(none)
define linkonce_odr protected float @__ocml_erf_f32(float noundef %0) local_unnamed_addr #2 {
%2 = tail call float @llvm.fabs.f32(float %0)
%3 = fcmp olt float %2, 1.000000e+00
br i1 %3, label %4, label %12
4: ; preds = %1
%5 = fmul float %0, %0
%6 = tail call float @__ocml_fmuladd_f32(float noundef %5, float noundef 0xBF4268BC20000000, float noundef 0x3F74208280000000) #15
%7 = tail call float @__ocml_fmuladd_f32(float noundef %5, float noundef %6, float noundef 0xBF9B593700000000) #15
// -----// IR Dump After NormalizeLoopBoundsPass (iree-codegen-normalize-loop-bounds) //----- //
func.func @matmul_i8_dispatch_3() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>}>} {
%c16384_i32 = arith.constant 16384 : i32
%c4_i32 = arith.constant 4 : i32
%c1_i32 = arith.constant 1 : i32
%c8_i32 = arith.constant 8 : i32
%c2_i32 = arith.constant 2 : i32
%c0 = arith.constant 0 : index
%c16384 = arith.constant 16384 : index
bjacob / README.md
Last active January 21, 2025 16:16
Explore data tiling on CPU

Reproduce a data-tiled matmul on CPU

Test source program: matmul_i8_128x128.mlir:

func.func @matmul_i8(%lhs: tensor<128x128xi8>, %rhs: tensor<128x128xi8>, %acc: tensor<128x128xi32>) -> tensor<128x128xi32> {
  %result = linalg.matmul ins(%lhs, %rhs: tensor<128x128xi8>, tensor<128x128xi8>) outs(%acc: tensor<128x128xi32>) -> tensor<128x128xi32>
  return %result: tensor<128x128xi32>
}
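
For reference, this is the arithmetic that the linalg.matmul above performs, written as plain untiled C (i8 operands widened to i32 and accumulated in i32); data tiling changes only the memory layout, not this computation.

#include <stdint.h>

enum { N = 128 };

/* Plain, untiled reference for the 128x128 i8 matmul above:
   i8 operands are widened to i32 and accumulated into the i32 output. */
void matmul_i8_ref(const int8_t lhs[N][N], const int8_t rhs[N][N],
                   int32_t acc[N][N]) {
  for (int i = 0; i < N; ++i)
    for (int j = 0; j < N; ++j)
      for (int k = 0; k < N; ++k)
        acc[i][j] += (int32_t)lhs[i][k] * (int32_t)rhs[k][j];
}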

Overview

At the moment, there is a disconnect in "fast math" semantics between what we do in MLIR rewrites and what we let LLVM do after we have lowered to LLVM IR:

  • In MLIR rewrites, we are performing many "fast math"-like transformations. For example, reassociation ((x+y)+z -> x+(y+z)); a small C example of why this is not exact follows this list.
    • That is necessary to implement something like a matrix multiplication efficiently, at multiple levels. At the instruction level, if we are targeting matrix-multiplication instructions, using them is in itself a reassociation. At the workgroup-distribution level, whenever we split a reduction dimension, that is a reassociation.
  • The LLVM IR that we bottom out on does not have fast-math flags.
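
To make the exactness trade-off concrete, here is a small standalone C example (values chosen purely for illustration) where the two groupings of the same sum give different results:

#include <stdio.h>

int main(void) {
  /* Reassociation is not exact in floating point: with these values the
     two groupings of the same sum differ. */
  float x = 1.0e8f, y = -1.0e8f, z = 1.0f;
  printf("(x+y)+z = %g\n", (x + y) + z); /* prints 1 */
  printf("x+(y+z) = %g\n", x + (y + z)); /* prints 0 */
  return 0;
}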

So we are not allowing LLVM to perform the same kind of optimizations that we allowed ourselves. That inconsistency is the topic of this issue. It matters because, while any choice of fast-math semantics is a trade-off on an axis between performance and exactness, an inconsistency means we have already paid the exactness cost in the MLIR rewrites without letting LLVM claim the corresponding performance.