commit 99ea7d9b3d64c51694b8cc35c6bd25b4945ab0c2
Author: Benjamin Kramer <[email protected]>
Date: Wed Apr 25 12:39:50 2012 +0200
X86: Turn cmovs into branches when profitable.
This came up when a change in block placement formed a cmov and slowed down a
hot loop by 50%:
ucomisd (%rdi), %xmm0
cmovbel %edx, %esi
cmov is a really bad choice in this context because it doesn't get branch
prediction. If we emit it as a branch, an out-of-order CPU can do a better job
(if the branch is predicted right) and avoid waiting for the slow load+compare
instruction to finish. Of course it won't help if the branch is unpredictable,
but those are really rare in practice.
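For illustration only (a hand-written sketch, not actual llc output; the block
label and the exact register layout are assumed), the branch form of the
snippet above would look roughly like:
ucomisd (%rdi), %xmm0
ja .LBB0_2         # skip the move when the cmovbe condition is false (label illustrative)
movl %edx, %esi    # take %edx when the condition holds
.LBB0_2: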
As a heuristic, turn all cmovs whose compare has a single use and a direct
memory operand into branches. cmovs usually save some code size, so disable
the transform in -Os mode and on Atom, which is an in-order microarchitecture
and unlikely to benefit from this.
The test suite shows a 9% improvement on CoyoteBench/huffbench. I'm not aware
of any significant execution time regressions. The results depend a lot on the
microarchitecture used, so YMMV.
There is probably still more to get out of this, as LLVM really likes setcc
and cmov, sometimes forming long chains of them, which are deadly for modern
CPUs. Thanks to Chandler Carruth for the initial test case and for providing
me with stable test-suite numbers :)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 0f99844..628de51 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -12483,6 +12483,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::CMOV_RFP32:
case X86::CMOV_RFP64:
case X86::CMOV_RFP80:
+ case X86::CMOV_GR16_EXP:
+ case X86::CMOV_GR32_EXP:
+ case X86::CMOV_GR64_EXP:
return EmitLoweredSelect(MI, BB);
case X86::FP32_TO_INT16_IN_MEM:
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 6f9e849..bdcb063 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -421,6 +421,35 @@ def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
//===----------------------------------------------------------------------===//
// Conditional Move Pseudo Instructions
+// Fragment to identify cmovs that are better emitted as an explicit branch.
+def expensive_cmov : PatFrag<(ops node:$src1, node:$src2, node:$cond,
+                                  node:$flags),
+                             (X86cmov node:$src1, node:$src2, node:$cond,
+                                      node:$flags),[{
+  // cmov is usually denser than branches. Atom is an in-order cpu so branches
+  // can't give a big improvement there.
+  if (OptForSize || Subtarget->isAtom())
+    return false;
+
+  // If the branch is predicted right, an out of order CPU can avoid blocking on
+  // the compare. Emit cmovs on compares with a memory operand as branches to
+  // avoid stalls on the load from memory. If the compare has more than one use
+  // there's probably another cmov or setcc around so it's not worth emitting a
+  // branch.
+  SDValue Cmp = N->getOperand(3);
+  if (Cmp->getOpcode() == X86ISD::CMP && Cmp->hasOneUse()) {
+    SDValue Cmp0 = Cmp->getOperand(0);
+    SDValue Cmp1 = Cmp->getOperand(1);
+
+    // We check that the memory operand has one use to avoid uses of the loaded
+    // value directly after the compare, making branches unprofitable.
+    return ((isa<LoadSDNode>(Cmp0) && Cmp0->hasOneUse()) ||
+            (isa<LoadSDNode>(Cmp1) && Cmp1->hasOneUse()));
+  }
+
+  return false;
+}]>;
+
// X86 doesn't have 8-bit conditional moves. Use a customInserter to
// emit control flow. An alternative to this is to mark i8 SELECT as Promote,
// however that requires promoting the operands, and can induce additional
@@ -465,6 +494,27 @@ def CMOV_RFP80 : I<0, Pseudo,
(X86cmov RFP80:$src1, RFP80:$src2, imm:$cond,
EFLAGS))]>;
} // Predicates = [NoCMov]
+
+// Patterns for cmovs that are better emitted as a branch. See expensive_cmov
+// above for the conditions.
+let AddedComplexity = 2 in { // Override patterns below.
+def CMOV_GR32_EXP : I<0, Pseudo,
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$cond),
+ "#CMOV_GR32_EXP* PSEUDO!",
+ [(set GR32:$dst,
+ (expensive_cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>;
+def CMOV_GR16_EXP : I<0, Pseudo,
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$cond),
+ "#CMOV_GR16_EXP* PSEUDO!",
+ [(set GR16:$dst,
+ (expensive_cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>;
+def CMOV_GR64_EXP : I<0, Pseudo,
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, i8imm:$cond),
+ "#CMOV_GR64_EXP* PSEUDO!",
+ [(set GR64:$dst,
+ (expensive_cmov GR64:$src1, GR64:$src2, imm:$cond, EFLAGS))]>;
+} // AddedComplexity = 2
+
} // UsesCustomInserter = 1, Uses = [EFLAGS]
diff --git a/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll b/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll
index 266fd7b..37f74bd 100644
--- a/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll
+++ b/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll
@@ -10,7 +10,7 @@
%struct.indexentry = type { i32, i8*, i8*, i8*, i8*, i8* }
-define i32 @_bfd_stab_section_find_nearest_line(i32 %offset) nounwind {
+define i32 @_bfd_stab_section_find_nearest_line(i32 %offset) nounwind optsize {
entry:
%tmp910 = add i32 0, %offset ; <i32> [#uses=1]
br i1 true, label %bb951, label %bb917
diff --git a/test/CodeGen/X86/cmov-into-branch.ll b/test/CodeGen/X86/cmov-into-branch.ll
new file mode 100644
index 0000000..780746a
--- /dev/null
+++ b/test/CodeGen/X86/cmov-into-branch.ll
@@ -0,0 +1,63 @@
+; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s
+
+; cmp with single-use load, should not form cmov.
+define i32 @test1(double %a, double* nocapture %b, i32 %x, i32 %y) {
+ %load = load double* %b, align 8
+ %cmp = fcmp olt double %load, %a
+ %cond = select i1 %cmp, i32 %x, i32 %y
+ ret i32 %cond
+; CHECK: test1:
+; CHECK: ucomisd
+; CHECK-NOT: cmov
+; CHECK: j
+; CHECK-NOT: cmov
+}
+
+; Sanity check: no load.
+define i32 @test2(double %a, double %b, i32 %x, i32 %y) {
+ %cmp = fcmp ogt double %a, %b
+ %cond = select i1 %cmp, i32 %x, i32 %y
+ ret i32 %cond
+; CHECK: test2:
+; CHECK: ucomisd
+; CHECK: cmov
+}
+
+; Multiple uses of %a, should not form cmov.
+define i32 @test3(i32 %a, i32* nocapture %b, i32 %x) {
+ %load = load i32* %b, align 4
+ %cmp = icmp ult i32 %load, %a
+ %cond = select i1 %cmp, i32 %a, i32 %x
+ ret i32 %cond
+; CHECK: test3:
+; CHECK: cmpl
+; CHECK-NOT: cmov
+; CHECK: j
+; CHECK-NOT: cmov
+}
+
+; Multiple uses of the load.
+define i32 @test4(i32 %a, i32* nocapture %b, i32 %x, i32 %y) {
+ %load = load i32* %b, align 4
+ %cmp = icmp ult i32 %load, %a
+ %cond = select i1 %cmp, i32 %x, i32 %y
+ %add = add i32 %cond, %load
+ ret i32 %add
+; CHECK: test4:
+; CHECK: cmpl
+; CHECK: cmov
+}
+
+; Multiple uses of the cmp.
+define i32 @test5(i32 %a, i32* nocapture %b, i32 %x, i32 %y) {
+ %load = load i32* %b, align 4
+ %cmp = icmp ult i32 %load, %a
+ %cmp1 = icmp ugt i32 %load, %a
+ %cond = select i1 %cmp1, i32 %a, i32 %y
+ %cond5 = select i1 %cmp, i32 %cond, i32 %x
+ ret i32 %cond5
+; CHECK: test5:
+; CHECK: cmpl
+; CHECK: cmov
+; CHECK: cmov
+}