commit 99ea7d9b3d64c51694b8cc35c6bd25b4945ab0c2
Author: Benjamin Kramer <[email protected]>
Date: Wed Apr 25 12:39:50 2012 +0200
X86: Turn cmovs into branches when profitable.
This came up when a change in block placement formed a cmov and slowed down a
hot loop by 50%:
ucomisd (%rdi), %xmm0
cmovbel %edx, %esi
cmov is a really bad choice in this context because it doesn't get branch
prediction. If we emit it as a branch, an out-of-order CPU can do a better job
(if the branch is predicted right) and avoid waiting for the slow load+compare
instruction to finish. Of course it won't help if the branch is unpredictable,
but those are really rare in practice.
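For illustration only (a hand-written sketch, not actual llc output; the block
label and the exact register layout are assumed), the branch form of the
snippet above would look roughly like:
ucomisd (%rdi), %xmm0
ja .LBB0_2         # skip the move when the cmovbe condition is false (label illustrative)
movl %edx, %esi    # take %edx when the condition holds
.LBB0_2: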
As a heuristic, turn all cmovs whose compare has a single use and a direct
memory operand into branches. cmovs usually save some code size, so disable
the transform in -Os mode and on Atom, which is an in-order microarchitecture
and unlikely to benefit from this.
The test suite shows a 9% improvement on CoyoteBench/huffbench. I'm not aware
of any significant execution time regressions. The results depend a lot on the
microarchitecture used, so YMMV.
There is probably still more to get out of this, as LLVM really likes setcc
and cmov, sometimes forming long chains of them, which are deadly for modern
CPUs. Thanks to Chandler Carruth for the initial test case and for providing
me with stable test-suite numbers :)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 0f99844..628de51 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -12483,6 +12483,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::CMOV_RFP32:
case X86::CMOV_RFP64:
case X86::CMOV_RFP80:
+ case X86::CMOV_GR16_EXP:
+ case X86::CMOV_GR32_EXP:
+ case X86::CMOV_GR64_EXP:
return EmitLoweredSelect(MI, BB);
case X86::FP32_TO_INT16_IN_MEM:
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 6f9e849..bdcb063 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -421,6 +421,35 @@ def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
//===----------------------------------------------------------------------===//
// Conditional Move Pseudo Instructions
+// Fragment to identify cmovs that are better emitted as an explicit branch.
+def expensive_cmov : PatFrag<(ops node:$src1, node:$src2, node:$cond,
+                                  node:$flags),
+                             (X86cmov node:$src1, node:$src2, node:$cond,
+                                      node:$flags),[{
+  // cmov is usually denser than branches. Atom is an in-order cpu so branches
+  // can't give a big improvement there.
+  if (OptForSize || Subtarget->isAtom())
+    return false;
+
+  // If the branch is predicted right, an out of order CPU can avoid blocking on
+  // the compare. Emit cmovs on compares with a memory operand as branches to
+  // avoid stalls on the load from memory. If the compare has more than one use
+  // there's probably another cmov or setcc around so it's not worth emitting a
+  // branch.
+  SDValue Cmp = N->getOperand(3);
+  if (Cmp->getOpcode() == X86ISD::CMP && Cmp->hasOneUse()) {
+    SDValue Cmp0 = Cmp->getOperand(0);
+    SDValue Cmp1 = Cmp->getOperand(1);
+
+    // We check that the memory operand has one use to avoid uses of the loaded
+    // value directly after the compare, making branches unprofitable.
+    return ((isa<LoadSDNode>(Cmp0) && Cmp0->hasOneUse()) ||
+            (isa<LoadSDNode>(Cmp1) && Cmp1->hasOneUse()));
+  }
+
+  return false;
+}]>;
+
// X86 doesn't have 8-bit conditional moves. Use a customInserter to
// emit control flow. An alternative to this is to mark i8 SELECT as Promote,
// however that requires promoting the operands, and can induce additional
@@ -465,6 +494,27 @@ def CMOV_RFP80 : I<0, Pseudo,
(X86cmov RFP80:$src1, RFP80:$src2, imm:$cond,
EFLAGS))]>;
} // Predicates = [NoCMov]
+
+// Patterns for cmovs that are better emitted as a branch. See expensive_cmov
+// above for the conditions.
+let AddedComplexity = 2 in { // Override patterns below.
+def CMOV_GR32_EXP : I<0, Pseudo,
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$cond),
+ "#CMOV_GR32_EXP* PSEUDO!",
+ [(set GR32:$dst,
+ (expensive_cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>;
+def CMOV_GR16_EXP : I<0, Pseudo,
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$cond),
+ "#CMOV_GR16_EXP* PSEUDO!",
+ [(set GR16:$dst,
+ (expensive_cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>;
+def CMOV_GR64_EXP : I<0, Pseudo,
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, i8imm:$cond),
+ "#CMOV_GR64_EXP* PSEUDO!",
+ [(set GR64:$dst,
+ (expensive_cmov GR64:$src1, GR64:$src2, imm:$cond, EFLAGS))]>;
+} // AddedComplexity = 2
+
} // UsesCustomInserter = 1, Uses = [EFLAGS]
diff --git a/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll b/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll
index 266fd7b..37f74bd 100644
--- a/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll
+++ b/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll
@@ -10,7 +10,7 @@
%struct.indexentry = type { i32, i8*, i8*, i8*, i8*, i8* }
-define i32 @_bfd_stab_section_find_nearest_line(i32 %offset) nounwind {
+define i32 @_bfd_stab_section_find_nearest_line(i32 %offset) nounwind optsize {
entry:
%tmp910 = add i32 0, %offset ; <i32> [#uses=1]
br i1 true, label %bb951, label %bb917
diff --git a/test/CodeGen/X86/cmov-into-branch.ll b/test/CodeGen/X86/cmov-into-branch.ll
new file mode 100644
index 0000000..780746a
--- /dev/null
+++ b/test/CodeGen/X86/cmov-into-branch.ll
@@ -0,0 +1,63 @@
+; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s
+
+; cmp with single-use load, should not form cmov.
+define i32 @test1(double %a, double* nocapture %b, i32 %x, i32 %y) {
+ %load = load double* %b, align 8
+ %cmp = fcmp olt double %load, %a
+ %cond = select i1 %cmp, i32 %x, i32 %y
+ ret i32 %cond
+; CHECK: test1:
+; CHECK: ucomisd
+; CHECK-NOT: cmov
+; CHECK: j
+; CHECK-NOT: cmov
+}
+
+; Sanity check: no load.
+define i32 @test2(double %a, double %b, i32 %x, i32 %y) {
+ %cmp = fcmp ogt double %a, %b
+ %cond = select i1 %cmp, i32 %x, i32 %y
+ ret i32 %cond
+; CHECK: test2:
+; CHECK: ucomisd
+; CHECK: cmov
+}
+
+; Multiple uses of %a, should not form cmov.
+define i32 @test3(i32 %a, i32* nocapture %b, i32 %x) {
+ %load = load i32* %b, align 4
+ %cmp = icmp ult i32 %load, %a
+ %cond = select i1 %cmp, i32 %a, i32 %x
+ ret i32 %cond
+; CHECK: test3:
+; CHECK: cmpl
+; CHECK-NOT: cmov
+; CHECK: j
+; CHECK-NOT: cmov
+}
+
+; Multiple uses of the load.
+define i32 @test4(i32 %a, i32* nocapture %b, i32 %x, i32 %y) {
+ %load = load i32* %b, align 4
+ %cmp = icmp ult i32 %load, %a
+ %cond = select i1 %cmp, i32 %x, i32 %y
+ %add = add i32 %cond, %load
+ ret i32 %add
+; CHECK: test4:
+; CHECK: cmpl
+; CHECK: cmov
+}
+
+; Multiple uses of the cmp.
+define i32 @test5(i32 %a, i32* nocapture %b, i32 %x, i32 %y) {
+ %load = load i32* %b, align 4
+ %cmp = icmp ult i32 %load, %a
+ %cmp1 = icmp ugt i32 %load, %a
+ %cond = select i1 %cmp1, i32 %a, i32 %y
+ %cond5 = select i1 %cmp, i32 %cond, i32 %x
+ ret i32 %cond5
+; CHECK: test5:
+; CHECK: cmpl
+; CHECK: cmov
+; CHECK: cmov
+}