Skip to content

Instantly share code, notes, and snippets.

@johnbartholomew
Created April 4, 2012 20:48
Show Gist options
  • Save johnbartholomew/2305480 to your computer and use it in GitHub Desktop.
Save johnbartholomew/2305480 to your computer and use it in GitHub Desktop.
compiler smarts: recognising shifts and ors as a load or byte-swap
#include <string.h>
#include <stdint.h>
/* compiled with:
* {gcc,clang} -std=c89 -pedantic -Wall -Wextra -O3 -S -c byte-order-test.c
*
* Built on and for an x86-64 system.
* The full assembly output from both compilers is embedded at the bottom.
*
* Summary:
* - Clang gets it fully right for some code structures, but not for others,
* recognising both a straight load and a 32-bit endian-swap.
* - GCC 4.7.0 gets it partially right, for some code structures but not others,
* recognising a straight load, but not a 32-bit endian-swap
* (or conceivably recognising an endian-swap but deciding that the bswap
* instruction is slower than explicitly coded movs and logic ops;
* I don't know whether that's true for typical x86-64 architectures)
* However, it doesn't always remove the loop, even in a couple of cases
* where it eliminates the memory -> register -> memory step.
*/
/* ---- A versions ---- */
/* clang 3: recognises this as a load
* gcc 4.7: recognises this as a load */
/* Reassemble the four bytes of `word`, taken in memory order, as a
 * little-endian 32-bit value: the byte at the lowest address becomes the
 * least significant byte of the result.
 *
 * word: value whose in-memory bytes are read one at a time.
 * Returns the little-endian interpretation of those four bytes.
 */
uint32_t fromLE32_A(uint32_t word) {
    uint8_t bytes[4];
    uint32_t value;

    /* Copy out the object representation so each byte can be indexed. */
    memcpy(bytes, &word, sizeof bytes);

    /* Widen each byte to 32 bits before shifting it into position. */
    value  = (uint32_t)bytes[0];
    value |= (uint32_t)bytes[1] << 8;
    value |= (uint32_t)bytes[2] << 16;
    value |= (uint32_t)bytes[3] << 24;
    return value;
}
/* clang 3: uses the bswap instruction
* gcc 4.7: uses movs, shifts and ors */
/* Reassemble the four bytes of `word`, taken in memory order, as a
 * big-endian 32-bit value: the byte at the lowest address becomes the
 * most significant byte of the result.
 *
 * word: value whose in-memory bytes are read one at a time.
 * Returns the big-endian interpretation of those four bytes.
 */
uint32_t fromBE32_A(uint32_t word) {
    uint8_t bytes[4];
    uint32_t value;

    /* Copy out the object representation so each byte can be indexed. */
    memcpy(bytes, &word, sizeof bytes);

    /* Widen each byte to 32 bits before shifting it into position. */
    value  = (uint32_t)bytes[0] << 24;
    value |= (uint32_t)bytes[1] << 16;
    value |= (uint32_t)bytes[2] << 8;
    value |= (uint32_t)bytes[3];
    return value;
}
/* clang 3: turns this into an empty function (just a ret)
* gcc 4.7: removes the loop body but keeps the loop (!) */
/* Replace each of the first `n` elements of `words` with the result of
 * fromLE32_A applied to that element.
 *
 * words: array of at least `n` elements, rewritten in place.
 * n:     number of elements to process; 0 leaves the array untouched.
 */
void words_fromLE32_A(uint32_t *words, size_t n) {
    size_t idx = 0;

    while (idx < n) {
        words[idx] = fromLE32_A(words[idx]);
        ++idx;
    }
}
/* clang 3: turns this into a bswap loop
* gcc 4.7: loop containing movs, shifts and ors */
/* Replace each of the first `n` elements of `words` with the result of
 * fromBE32_A applied to that element.
 *
 * words: array of at least `n` elements, rewritten in place.
 * n:     number of elements to process; 0 leaves the array untouched.
 */
void words_fromBE32_A(uint32_t *words, size_t n) {
    size_t idx = 0;

    while (idx < n) {
        words[idx] = fromBE32_A(words[idx]);
        ++idx;
    }
}
/* ---- B versions ---- */
/* clang 3: uses movs, shifts and ors
* gcc 4.7: uses movs, shifts and ors */
/* Decode a little-endian 32-bit value from four consecutive bytes.
 *
 * x: pointer to at least four readable bytes; x[0] is the least
 *    significant byte of the result. The buffer is only read, so the
 *    pointer is const-qualified and callers may pass read-only data.
 * Returns the decoded value.
 */
uint32_t fromLE32_B(const uint8_t *x) {
    /* Each byte is widened to uint32_t before the shift so that the
     * shift by 24 cannot overflow a promoted signed int. */
    return
        ((uint32_t)(x[0]) << 0) |
        ((uint32_t)(x[1]) << 8) |
        ((uint32_t)(x[2]) << 16) |
        ((uint32_t)(x[3]) << 24);
}
/* clang 3: uses movs, shifts and ors
* gcc 4.7: uses movs, shifts and ors */
/* Decode a big-endian 32-bit value from four consecutive bytes.
 *
 * x: pointer to at least four readable bytes; x[0] is the most
 *    significant byte of the result. The buffer is only read, so the
 *    pointer is const-qualified and callers may pass read-only data.
 * Returns the decoded value.
 */
uint32_t fromBE32_B(const uint8_t *x) {
    /* Each byte is widened to uint32_t before the shift so that the
     * shift by 24 cannot overflow a promoted signed int. */
    return
        ((uint32_t)(x[0]) << 24) |
        ((uint32_t)(x[1]) << 16) |
        ((uint32_t)(x[2]) << 8) |
        ((uint32_t)(x[3]) << 0);
}
/* clang 3: empty function (just a ret)
* gcc 4.7: removes the loop body, but keeps the loop (!) */
/* Replace each of the first `n` elements of `words` with the result of
 * fromLE32_B applied to a byte copy of that element.
 *
 * words: array of at least `n` elements, rewritten in place.
 * n:     number of elements to process; 0 leaves the array untouched.
 */
void words_fromLE32_B(uint32_t *words, size_t n) {
    uint8_t scratch[4];
    size_t idx = 0;

    while (idx < n) {
        /* Stage the element as raw bytes, then decode from the copy. */
        memcpy(scratch, words + idx, sizeof scratch);
        words[idx] = fromLE32_B(scratch);
        ++idx;
    }
}
/* clang 3: turns this into a bswap loop
* gcc 4.7: loop containing movs, shifts and ors */
/* Replace each of the first `n` elements of `words` with the result of
 * fromBE32_B applied to a byte copy of that element.
 *
 * words: array of at least `n` elements, rewritten in place.
 * n:     number of elements to process; 0 leaves the array untouched.
 */
void words_fromBE32_B(uint32_t *words, size_t n) {
    uint8_t scratch[4];
    size_t idx = 0;

    while (idx < n) {
        /* Stage the element as raw bytes, then decode from the copy. */
        memcpy(scratch, words + idx, sizeof scratch);
        words[idx] = fromBE32_B(scratch);
        ++idx;
    }
}
/* ----------------------------------------------------- */
/* clang 3.0 output */
#if 0
.file "byte-order-test.c"
.text
# clang 3.0: fromLE32_A collapses to returning the argument register as-is.
.globl fromLE32_A
.align 16, 0x90
.type fromLE32_A,@function
fromLE32_A: # @fromLE32_A
.Ltmp0:
.cfi_startproc
# BB#0:
movl %edi, %eax        # result = word; no byte loads, shifts or ors remain
ret
.Ltmp1:
.size fromLE32_A, .Ltmp1-fromLE32_A
.Ltmp2:
.cfi_endproc
.Leh_func_end0:
# clang 3.0: fromBE32_A becomes a single byte-swap of the argument.
.globl fromBE32_A
.align 16, 0x90
.type fromBE32_A,@function
fromBE32_A: # @fromBE32_A
.Ltmp3:
.cfi_startproc
# BB#0:
bswapl %edi            # reverse the four bytes of word in place
movl %edi, %eax        # result = swapped word
ret
.Ltmp4:
.size fromBE32_A, .Ltmp4-fromBE32_A
.Ltmp5:
.cfi_endproc
.Leh_func_end1:
# clang 3.0: words_fromLE32_A has no work left; the whole loop was removed.
.globl words_fromLE32_A
.align 16, 0x90
.type words_fromLE32_A,@function
words_fromLE32_A: # @words_fromLE32_A
.Ltmp6:
.cfi_startproc
# BB#0: # %.lr.ph
ret                    # neither words (rdi) nor n (rsi) is touched
.Ltmp7:
.size words_fromLE32_A, .Ltmp7-words_fromLE32_A
.Ltmp8:
.cfi_endproc
.Leh_func_end2:
# clang 3.0: words_fromBE32_A is a load / bswap / store loop.
# Registers: rdi = pointer to the current element, rsi = elements remaining.
.globl words_fromBE32_A
.align 16, 0x90
.type words_fromBE32_A,@function
words_fromBE32_A: # @words_fromBE32_A
.Ltmp9:
.cfi_startproc
# BB#0:
testq %rsi, %rsi       # n == 0 ?
je .LBB3_2             # yes: nothing to convert
.align 16, 0x90
.LBB3_1: # %.lr.ph
# =>This Inner Loop Header: Depth=1
movl (%rdi), %eax      # eax = current element
bswapl %eax            # reverse its byte order
movl %eax, (%rdi)      # store it back in place
addq $4, %rdi          # advance to the next 32-bit element
decq %rsi              # one fewer element remaining
jne .LBB3_1            # repeat until the count reaches zero
.LBB3_2: # %._crit_edge
ret
.Ltmp10:
.size words_fromBE32_A, .Ltmp10-words_fromBE32_A
.Ltmp11:
.cfi_endproc
.Leh_func_end3:
# clang 3.0: fromLE32_B is NOT folded into one 32-bit load; each byte is
# loaded, shifted and ored separately. rdi = x.
.globl fromLE32_B
.align 16, 0x90
.type fromLE32_B,@function
fromLE32_B: # @fromLE32_B
.Ltmp12:
.cfi_startproc
# BB#0:
movzbl (%rdi), %ecx    # ecx = x[0]
movzbl 1(%rdi), %eax   # eax = x[1]
shll $8, %eax          # eax = x[1] << 8
orl %ecx, %eax         # eax = x[0] | x[1] << 8
movzbl 2(%rdi), %ecx   # ecx = x[2]
shll $16, %ecx         # ecx = x[2] << 16
orl %eax, %ecx         # ecx = low three bytes combined
movzbl 3(%rdi), %eax   # eax = x[3]
shll $24, %eax         # eax = x[3] << 24
orl %ecx, %eax         # eax = complete result
ret
.Ltmp13:
.size fromLE32_B, .Ltmp13-fromLE32_B
.Ltmp14:
.cfi_endproc
.Leh_func_end4:
# clang 3.0: fromBE32_B is NOT folded into load + bswap; each byte is
# loaded, shifted and ored separately. rdi = x.
.globl fromBE32_B
.align 16, 0x90
.type fromBE32_B,@function
fromBE32_B: # @fromBE32_B
.Ltmp15:
.cfi_startproc
# BB#0:
movzbl (%rdi), %ecx    # ecx = x[0]
shll $24, %ecx         # ecx = x[0] << 24
movzbl 1(%rdi), %eax   # eax = x[1]
shll $16, %eax         # eax = x[1] << 16
orl %ecx, %eax         # eax = x[0] << 24 | x[1] << 16
movzbl 3(%rdi), %ecx   # ecx = x[3] (lowest byte, no shift needed)
orl %eax, %ecx         # ecx = everything except x[2]
movzbl 2(%rdi), %eax   # eax = x[2]
shll $8, %eax          # eax = x[2] << 8
orl %ecx, %eax         # eax = complete result
ret
.Ltmp16:
.size fromBE32_B, .Ltmp16-fromBE32_B
.Ltmp17:
.cfi_endproc
.Leh_func_end5:
# clang 3.0: words_fromLE32_B has no work left; the whole loop was removed.
.globl words_fromLE32_B
.align 16, 0x90
.type words_fromLE32_B,@function
words_fromLE32_B: # @words_fromLE32_B
.Ltmp18:
.cfi_startproc
# BB#0: # %.lr.ph
ret                    # neither words (rdi) nor n (rsi) is touched
.Ltmp19:
.size words_fromLE32_B, .Ltmp19-words_fromLE32_B
.Ltmp20:
.cfi_endproc
.Leh_func_end6:
# clang 3.0: words_fromBE32_B is a load / bswap / store loop; the
# instruction sequence matches the one emitted for words_fromBE32_A.
# Registers: rdi = pointer to the current element, rsi = elements remaining.
.globl words_fromBE32_B
.align 16, 0x90
.type words_fromBE32_B,@function
words_fromBE32_B: # @words_fromBE32_B
.Ltmp21:
.cfi_startproc
# BB#0:
testq %rsi, %rsi       # n == 0 ?
je .LBB7_2             # yes: nothing to convert
.align 16, 0x90
.LBB7_1: # %.lr.ph
# =>This Inner Loop Header: Depth=1
movl (%rdi), %eax      # eax = current element
bswapl %eax            # reverse its byte order
movl %eax, (%rdi)      # store it back in place
addq $4, %rdi          # advance to the next 32-bit element
decq %rsi              # one fewer element remaining
jne .LBB7_1            # repeat until the count reaches zero
.LBB7_2: # %._crit_edge
ret
.Ltmp22:
.size words_fromBE32_B, .Ltmp22-words_fromBE32_B
.Ltmp23:
.cfi_endproc
.Leh_func_end7:
.section ".note.GNU-stack","",@progbits
#endif
/* gcc 4.7.0 output */
#if 0
.file "byte-order-test.c"
.text
# gcc 4.7.0: fromLE32_A collapses to returning the argument register as-is.
.p2align 4,,15
.globl fromLE32_A
.type fromLE32_A, @function
fromLE32_A:
.LFB12:
.cfi_startproc
movl %edi, %eax        # result = word; no byte loads, shifts or ors remain
ret
.cfi_endproc
.LFE12:
.size fromLE32_A, .-fromLE32_A
# gcc 4.7.0: fromBE32_A reverses the bytes of word with explicit moves,
# shifts, a mask and ors; no bswap instruction is emitted.
.p2align 4,,15
.globl fromBE32_A
.type fromBE32_A, @function
fromBE32_A:
.LFB13:
.cfi_startproc
movq %rdi, %rcx        # rcx = word, so bits 8..15 are reachable as %ch
movl %edi, %eax        # eax = word
movzbl %ch, %edx       # edx = bits 8..15 of word
sall $24, %eax         # eax = low byte moved up to bits 24..31
sall $16, %edx         # edx = bits 8..15 moved up to bits 16..23
orl %edx, %eax         # eax = top two bytes of the result
movl %edi, %edx        # edx = word again
shrl $8, %edi          # edi = word >> 8
shrl $24, %edx         # edx = top byte moved down to bits 0..7
andl $65280, %edi      # mask 0xFF00: bits 16..23 of word, now at bits 8..15
orl %edx, %eax         # merge in the lowest result byte
orl %edi, %eax         # merge in bits 8..15; eax = byte-reversed word
ret
.cfi_endproc
.LFE13:
.size fromBE32_A, .-fromBE32_A
# gcc 4.7.0: words_fromLE32_A keeps the loop skeleton although the body
# performs no load or store. rdi = current pointer, rax = end pointer.
.p2align 4,,15
.globl words_fromLE32_A
.type words_fromLE32_A, @function
words_fromLE32_A:
.LFB14:
.cfi_startproc
testq %rsi, %rsi             # n == 0 ?
leaq (%rdi,%rsi,4), %rax     # rax = words + n elements; lea leaves flags alone
je .L3                       # n == 0: skip the loop
.p2align 4,,10
.p2align 3
.L7:
addq $4, %rdi                # step to the next element (nothing is read or written)
cmpq %rax, %rdi              # reached the end pointer ?
jne .L7
.L3:
rep                          # prefix for the ret below; NOTE(review): reason not stated in this file
ret
.cfi_endproc
.LFE14:
.size words_fromLE32_A, .-words_fromLE32_A
# gcc 4.7.0: words_fromBE32_A loops over the array and reverses each
# element with moves, shifts, a mask and ors; no bswap is emitted.
# Registers: rdi = current pointer, r8 = end pointer.
.p2align 4,,15
.globl words_fromBE32_A
.type words_fromBE32_A, @function
words_fromBE32_A:
.LFB15:
.cfi_startproc
testq %rsi, %rsi             # n == 0 ?
leaq (%rdi,%rsi,4), %r8      # r8 = words + n elements; lea leaves flags alone
je .L10                      # n == 0: skip the loop
.p2align 4,,10
.p2align 3
.L14:
movl (%rdi), %edx            # edx = current element
movl %edx, %eax              # eax = copy of it
movzbl %dh, %ecx             # ecx = bits 8..15 of the element
sall $16, %ecx               # ecx = those bits moved up to bits 16..23
sall $24, %eax               # eax = low byte moved up to bits 24..31
orl %ecx, %eax               # eax = top two bytes of the result
movl %edx, %ecx              # ecx = element again
shrl $8, %edx                # edx = element >> 8
shrl $24, %ecx               # ecx = top byte moved down to bits 0..7
andl $65280, %edx            # mask 0xFF00: bits 16..23 of element, now at bits 8..15
orl %ecx, %eax               # merge in the lowest result byte
orl %edx, %eax               # merge in bits 8..15; eax = byte-reversed element
movl %eax, (%rdi)            # store it back in place
addq $4, %rdi                # step to the next element
cmpq %r8, %rdi               # reached the end pointer ?
jne .L14
.L10:
rep                          # prefix for the ret below; NOTE(review): reason not stated in this file
ret
.cfi_endproc
.LFE15:
.size words_fromBE32_A, .-words_fromBE32_A
# gcc 4.7.0: fromLE32_B is NOT folded into one 32-bit load; each byte is
# loaded, shifted and ored separately. rdi = x.
.p2align 4,,15
.globl fromLE32_B
.type fromLE32_B, @function
fromLE32_B:
.LFB16:
.cfi_startproc
movzbl 1(%rdi), %eax   # eax = x[1]
movzbl 2(%rdi), %edx   # edx = x[2]
sall $8, %eax          # eax = x[1] << 8
sall $16, %edx         # edx = x[2] << 16
orl %edx, %eax         # eax = the two middle bytes
movzbl (%rdi), %edx    # edx = x[0]
orl %edx, %eax         # eax = low three bytes combined
movzbl 3(%rdi), %edx   # edx = x[3]
sall $24, %edx         # edx = x[3] << 24
orl %edx, %eax         # eax = complete result
ret
.cfi_endproc
.LFE16:
.size fromLE32_B, .-fromLE32_B
# gcc 4.7.0: fromBE32_B is NOT folded into load + bswap; each byte is
# loaded, shifted and ored separately. rdi = x.
.p2align 4,,15
.globl fromBE32_B
.type fromBE32_B, @function
fromBE32_B:
.LFB17:
.cfi_startproc
movzbl (%rdi), %eax    # eax = x[0]
movzbl 1(%rdi), %edx   # edx = x[1]
sall $24, %eax         # eax = x[0] << 24
sall $16, %edx         # edx = x[1] << 16
orl %edx, %eax         # eax = top two bytes of the result
movzbl 3(%rdi), %edx   # edx = x[3] (lowest byte, no shift needed)
orl %edx, %eax         # eax = everything except x[2]
movzbl 2(%rdi), %edx   # edx = x[2]
sall $8, %edx          # edx = x[2] << 8
orl %edx, %eax         # eax = complete result
ret
.cfi_endproc
.LFE17:
.size fromBE32_B, .-fromBE32_B
# gcc 4.7.0: words_fromLE32_B keeps a counting loop although the body
# performs no load or store. rax = loop counter, rsi = n.
.p2align 4,,15
.globl words_fromLE32_B
.type words_fromLE32_B, @function
words_fromLE32_B:
.LFB18:
.cfi_startproc
xorl %eax, %eax        # counter = 0
testq %rsi, %rsi       # n == 0 ?
je .L18                # yes: skip the loop
.p2align 4,,10
.p2align 3
.L20:
addq $1, %rax          # counter++ (nothing is read or written)
cmpq %rsi, %rax        # counter == n ?
jne .L20
.L18:
rep                    # prefix for the ret below; NOTE(review): reason not stated in this file
ret
.cfi_endproc
.LFE18:
.size words_fromLE32_B, .-words_fromLE32_B
# gcc 4.7.0: words_fromBE32_B loops over the array and reverses each
# element with moves, shifts, a mask and ors; no bswap is emitted.
# Registers: rdi = current pointer, r8 = loop counter, rsi = n.
.p2align 4,,15
.globl words_fromBE32_B
.type words_fromBE32_B, @function
words_fromBE32_B:
.LFB19:
.cfi_startproc
xorl %r8d, %r8d        # counter = 0
testq %rsi, %rsi       # n == 0 ?
je .L23                # yes: skip the loop
.p2align 4,,10
.p2align 3
.L25:
movl (%rdi), %edx      # edx = current element
addq $1, %r8           # counter++
movl %edx, %eax        # eax = copy of the element
movzbl %dh, %ecx       # ecx = bits 8..15 of the element
sall $16, %ecx         # ecx = those bits moved up to bits 16..23
sall $24, %eax         # eax = low byte moved up to bits 24..31
orl %ecx, %eax         # eax = top two bytes of the result
movl %edx, %ecx        # ecx = element again
shrl $8, %edx          # edx = element >> 8
shrl $24, %ecx         # ecx = top byte moved down to bits 0..7
andl $65280, %edx      # mask 0xFF00: bits 16..23 of element, now at bits 8..15
orl %ecx, %eax         # merge in the lowest result byte
orl %edx, %eax         # merge in bits 8..15; eax = byte-reversed element
movl %eax, (%rdi)      # store it back in place
addq $4, %rdi          # step to the next element
cmpq %rsi, %r8         # counter == n ?
jne .L25
.L23:
rep                    # prefix for the ret below; NOTE(review): reason not stated in this file
ret
.cfi_endproc
.LFE19:
.size words_fromBE32_B, .-words_fromBE32_B
.ident "GCC: (GNU) 4.7.0 20120324 (prerelease)"
.section .note.GNU-stack,"",@progbits
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment