gcc-inline-copy-function.md

A function clone may prevent GCC from inlining a callee into the caller.

I noticed this while testing the target_clones feature for performance tuning.

Tested GCC versions: GCC 14.2 and GCC master at commit 1de156eb2bb445cd0e0a582944dcd75d085f30c9, on both x86-64 and RISC-V targets.

An example is shown below:

git clone https://github.com/eembc/coremark.git
cd coremark
git reset d5fad6bd094899101a4e5fd53af7298160ced6ab --hard
cat > crcu32_1.patch << EOF
diff a/core_util.c b/core_util.c
--- a/core_util.c
+++ b/core_util.c
@@ -201,6 +201,13 @@ crcu32(ee_u32 newval, ee_u16 crc)
     return crc;
 }
 ee_u16
+crcu32_2(ee_u32 newval, ee_u16 crc)
+{
+    crc = crc16((ee_s16)newval, crc);
+    crc = crc16((ee_s16)(newval >> 16), crc);
+    return crc;
+}
+ee_u16
 crc16(ee_s16 newval, ee_u16 crc)
 {
     return crcu16((ee_u16)newval, crc);
EOF
patch -p1 < crcu32_1.patch
gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2   -lrt"\" -DITERATIONS=0 core_util.c -S

Then look at the generated core_util.s file; you will find that the whole call chain (crcu32|crcu32_2)->crc16->crcu16->crcu8 has been inlined:

crcu32:
.LFB15:
	.cfi_startproc
	movl	%esi, %eax
	movl	%edi, %ecx
	movl	$8, %esi
	.p2align 5
	.p2align 4
	.p2align 3
.L33:
	movl	%edi, %edx
	shrb	%dil
	xorl	%eax, %edx
	shrw	%ax
	andl	$1, %edx
	negl	%edx
	andw	$-24575, %dx
	xorl	%edx, %eax
	subb	$1, %sil
	jne	.L33
	movzbl	%ch, %edi
	movl	$8, %esi

However, when the copied function is slightly modified, for instance by changing the shift amount from 16 to 15:

cat > crcu32_2.patch << EOF
diff a/core_util.c b/core_util.c
--- a/core_util.c
+++ b/core_util.c
@@ -204,7 +204,7 @@ ee_u16
 crcu32_2(ee_u32 newval, ee_u16 crc)
 {
     crc = crc16((ee_s16)newval, crc);
-    crc = crc16((ee_s16)(newval >> 16), crc);
+    crc = crc16((ee_s16)(newval >> 15), crc);
     return crc;
 }
 ee_u16
EOF
patch -p1 < crcu32_2.patch
gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2   -lrt"\" -DITERATIONS=0 core_util.c -S

You will notice that crcu32 and crcu32_2 are now both generated without inlining; for example, crcu32 now calls crcu16 directly:

crcu32:
.LFB15:
	.cfi_startproc
	movl	%edi, %r8d
	movzwl	%si, %esi
	movzwl	%di, %edi
	call	crcu16
	movl	%r8d, %edi
	movzwl	%ax, %esi
	shrl	$16, %edi
	jmp	crcu16
	.cfi_endproc

However, if we also duplicate the crc16->crcu16->crcu8 chain, so that it exists twice and crcu32_2 calls its own copies (crc16_2->crcu16_2->crcu8_2), inlining works again for the first crc16 call, but still not for the second.

cat > crcu16_1.patch << EOF
diff a/core_util.c b/core_util.c
--- a/core_util.c
+++ b/core_util.c
@@ -187,6 +187,31 @@ crcu8(ee_u8 data, ee_u16 crc)
     return crc;
 }
 ee_u16
+crcu8_2(ee_u8 data, ee_u16 crc)
+{
+    ee_u8 i = 0, x16 = 0, carry = 0;
+
+    for (i = 0; i < 8; i++)
+    {
+        x16 = (ee_u8)((data & 1) ^ ((ee_u8)crc & 1));
+        data >>= 1;
+
+        if (x16 == 1)
+        {
+            crc ^= 0x4002;
+            carry = 1;
+        }
+        else
+            carry = 0;
+        crc >>= 1;
+        if (carry)
+            crc |= 0x8000;
+        else
+            crc &= 0x7fff;
+    }
+    return crc;
+}
+ee_u16
 crcu16(ee_u16 newval, ee_u16 crc)
 {
     crc = crcu8((ee_u8)(newval), crc);
@@ -194,6 +219,13 @@ crcu16(ee_u16 newval, ee_u16 crc)
     return crc;
 }
 ee_u16
+crcu16_2(ee_u16 newval, ee_u16 crc)
+{
+    crc = crcu8_2((ee_u8)(newval), crc);
+    crc = crcu8_2((ee_u8)((newval) >> 8), crc);
+    return crc;
+}
+ee_u16
 crcu32(ee_u32 newval, ee_u16 crc)
 {
     crc = crc16((ee_s16)newval, crc);
@@ -201,10 +233,15 @@ crcu32(ee_u32 newval, ee_u16 crc)
     return crc;
 }
 ee_u16
+crc16_2(ee_s16 newval, ee_u16 crc)
+{
+    return crcu16_2((ee_u16)newval, crc);
+}
+ee_u16
 crcu32_2(ee_u32 newval, ee_u16 crc)
 {
-    crc = crc16((ee_s16)newval, crc);
-    crc = crc16((ee_s16)(newval >> 15), crc);
+    crc = crc16_2((ee_s16)newval, crc);
+    crc = crc16_2((ee_s16)(newval >> 15), crc);
     return crc;
 }
 ee_u16
EOF
patch -p1 < crcu16_1.patch
gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2   -lrt"\" -DITERATIONS=0 core_util.c -S

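In this case, both functions are inlined again, although only partially: the first crc16 call is inlined, while the listings still end with a tail call (jmp crcu16).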

crcu32:
.LFB17:
	.cfi_startproc
	movl	%edi, %edx
	movl	$8, %ecx
	.p2align 5
	.p2align 4
	.p2align 3
.L41:
	movl	%edi, %eax
	shrb	%dil
	xorl	%esi, %eax
	shrw	%si
	andl	$1, %eax
	negl	%eax
	andw	$-24575, %ax
	xorl	%eax, %esi
	subb	$1, %cl
	jne	.L41
....
	movzwl	%si, %esi
	movzwl	%dx, %edi
	jmp	crcu16
	.cfi_endproc
.LFE19:
	.size	crcu32_2, .-crcu32_2
	.p2align 4
	.globl	crc16
	.type	crc16, @function

....

crcu32_2:
.LFB19:
	.cfi_startproc
	movl	%edi, %edx
	movl	$8, %ecx
	.p2align 5
	.p2align 4
	.p2align 3
.L51:
	movl	%edi, %eax
	shrb	%dil
	xorl	%esi, %eax
	shrw	%si
	andl	$1, %eax
	negl	%eax
	andw	$-24575, %ax
	xorl	%eax, %esi
	subb	$1, %cl
	jne	.L51
	movzbl	%dh, %edi
	movl	$8, %ecx

....
	subb	$1, %cl
	jne	.L52
	shrl	$15, %edx
	movzwl	%si, %esi
	movzwl	%dx, %edi
	jmp	crcu16
	.cfi_endproc
.LFE19:
	.size	crcu32_2, .-crcu32_2
	.p2align 4
	.globl	crc16
	.type	crc16, @function

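Why? What makes GCC's inlining decision for these calls depend on whether a near-identical copy of the caller exists?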

cyyself/gcc-inline-copy-function.md