A function clone may prevent GCC from inlining a callee into the caller.
I noticed this when testing target_clones features for performance tuning.
Tested GCC versions: GCC 14.2 and GCC master (commit 1de156eb2bb445cd0e0a582944dcd75d085f30c9), on both x86-64 and RISC-V targets.
An example is shown below:
git clone https://github.com/eembc/coremark.git
cd coremark
git reset d5fad6bd094899101a4e5fd53af7298160ced6ab --hard
cat > crcu32_1.patch << EOF
diff a/core_util.c b/core_util.c
--- a/core_util.c
+++ b/core_util.c
@@ -201,6 +201,13 @@ crcu32(ee_u32 newval, ee_u16 crc)
return crc;
}
ee_u16
+crcu32_2(ee_u32 newval, ee_u16 crc)
+{
+ crc = crc16((ee_s16)newval, crc);
+ crc = crc16((ee_s16)(newval >> 16), crc);
+ return crc;
+}
+ee_u16
crc16(ee_s16 newval, ee_u16 crc)
{
return crcu16((ee_u16)newval, crc);
EOF
patch -p1 < crcu32_1.patch
gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2 -lrt"\" -DITERATIONS=0 core_util.c -S
Then look at the generated core_util.s file; you will find that the whole call chain (crcu32|crcu32_2)->crc16->crcu16->crcu8 has been inlined:
crcu32:
.LFB15:
.cfi_startproc
movl %esi, %eax
movl %edi, %ecx
movl $8, %esi
.p2align 5
.p2align 4
.p2align 3
.L33:
movl %edi, %edx
shrb %dil
xorl %eax, %edx
shrw %ax
andl $1, %edx
negl %edx
andw $-24575, %dx
xorl %edx, %eax
subb $1, %sil
jne .L33
movzbl %ch, %edi
movl $8, %esi
However, when the copied function differs slightly from the original (here the shift amount is changed from 16 to 15), for instance:
cat > crcu32_2.patch << EOF
diff a/core_util.c b/core_util.c
--- a/core_util.c
+++ b/core_util.c
@@ -204,7 +204,7 @@ ee_u16
crcu32_2(ee_u32 newval, ee_u16 crc)
{
crc = crc16((ee_s16)newval, crc);
- crc = crc16((ee_s16)(newval >> 16), crc);
+ crc = crc16((ee_s16)(newval >> 15), crc);
return crc;
}
ee_u16
EOF
patch -p1 < crcu32_2.patch
gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2 -lrt"\" -DITERATIONS=0 core_util.c -S
You will notice that both crcu32 and crcu32_2 are now generated without the callee chain being inlined; they call crcu16 instead:
crcu32:
.LFB15:
.cfi_startproc
movl %edi, %r8d
movzwl %si, %esi
movzwl %di, %edi
call crcu16
movl %r8d, %edi
movzwl %ax, %esi
shrl $16, %edi
jmp crcu16
.cfi_endproc
However, if we also duplicate the whole crc16->crcu16->crcu8 chain (adding crc16_2, crcu16_2 and crcu8_2, so that there are two copies of it), inlining still works for the first crc16 call, but not for the second.
cat > crcu16_1.patch << EOF
diff a/core_util.c b/core_util.c
--- a/core_util.c
+++ b/core_util.c
@@ -187,6 +187,31 @@ crcu8(ee_u8 data, ee_u16 crc)
return crc;
}
ee_u16
+crcu8_2(ee_u8 data, ee_u16 crc)
+{
+ ee_u8 i = 0, x16 = 0, carry = 0;
+
+ for (i = 0; i < 8; i++)
+ {
+ x16 = (ee_u8)((data & 1) ^ ((ee_u8)crc & 1));
+ data >>= 1;
+
+ if (x16 == 1)
+ {
+ crc ^= 0x4002;
+ carry = 1;
+ }
+ else
+ carry = 0;
+ crc >>= 1;
+ if (carry)
+ crc |= 0x8000;
+ else
+ crc &= 0x7fff;
+ }
+ return crc;
+}
+ee_u16
crcu16(ee_u16 newval, ee_u16 crc)
{
crc = crcu8((ee_u8)(newval), crc);
@@ -194,6 +219,13 @@ crcu16(ee_u16 newval, ee_u16 crc)
return crc;
}
ee_u16
+crcu16_2(ee_u16 newval, ee_u16 crc)
+{
+ crc = crcu8_2((ee_u8)(newval), crc);
+ crc = crcu8_2((ee_u8)((newval) >> 8), crc);
+ return crc;
+}
+ee_u16
crcu32(ee_u32 newval, ee_u16 crc)
{
crc = crc16((ee_s16)newval, crc);
@@ -201,10 +233,15 @@ crcu32(ee_u32 newval, ee_u16 crc)
return crc;
}
ee_u16
+crc16_2(ee_s16 newval, ee_u16 crc)
+{
+ return crcu16_2((ee_u16)newval, crc);
+}
+ee_u16
crcu32_2(ee_u32 newval, ee_u16 crc)
{
- crc = crc16((ee_s16)newval, crc);
- crc = crc16((ee_s16)(newval >> 15), crc);
+ crc = crc16_2((ee_s16)newval, crc);
+ crc = crc16_2((ee_s16)(newval >> 15), crc);
return crc;
}
ee_u16
EOF
patch -p1 < crcu16_1.patch
gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2 -lrt"\" -DITERATIONS=0 core_util.c -S
In this case, inlining is performed again in both functions:
crcu32:
.LFB17:
.cfi_startproc
movl %edi, %edx
movl $8, %ecx
.p2align 5
.p2align 4
.p2align 3
.L41:
movl %edi, %eax
shrb %dil
xorl %esi, %eax
shrw %si
andl $1, %eax
negl %eax
andw $-24575, %ax
xorl %eax, %esi
subb $1, %cl
jne .L41
....
movzwl %si, %esi
movzwl %dx, %edi
jmp crcu16
.cfi_endproc
.LFE19:
.size crcu32_2, .-crcu32_2
.p2align 4
.globl crc16
.type crc16, @function
....
crcu32_2:
.LFB19:
.cfi_startproc
movl %edi, %edx
movl $8, %ecx
.p2align 5
.p2align 4
.p2align 3
.L51:
movl %edi, %eax
shrb %dil
xorl %esi, %eax
shrw %si
andl $1, %eax
negl %eax
andw $-24575, %ax
xorl %eax, %esi
subb $1, %cl
jne .L51
movzbl %dh, %edi
movl $8, %ecx
....
subb $1, %cl
jne .L52
shrl $15, %edx
movzwl %si, %esi
movzwl %dx, %edi
jmp crcu16
.cfi_endproc
.LFE19:
.size crcu32_2, .-crcu32_2
.p2align 4
.globl crc16
.type crc16, @function
Why does the inlining behavior differ between these cases?