Created
April 4, 2012 20:48
-
-
Save johnbartholomew/2305480 to your computer and use it in GitHub Desktop.
compiler smarts: recognising shifts and ors as a load or byte-swap
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <string.h> | |
#include <stdint.h> | |
/* compiled with:
 *   {gcc,clang} -std=c89 -pedantic -Wall -Wextra -O3 -S -c byte-order-test.c
 *
 * Built on and for an x86-64 system.
 * The full assembly output from both compilers is embedded at the bottom.
 *
 * Summary:
 *  - Clang gets it fully right for some code structures but not for others,
 *    recognising both a straight load and a 32-bit endian-swap.
 *  - GCC 4.7.0 gets it partially right, again for some code structures but
 *    not for others: it recognises a straight load, but not a 32-bit
 *    endian-swap (or conceivably it recognises the endian-swap but decides
 *    that the bswap instruction is slower than explicitly coded movs and
 *    logic ops; I don't know whether that's true for typical x86-64
 *    architectures).
 *    However, it doesn't always remove the loop, even in a couple of cases
 *    where it eliminates the memory -> register -> memory step.
 */
/* ---- A versions ---- */ | |
/* fromLE32_A: treat the in-memory bytes of `word` as a little-endian
 * 32-bit value and return that value in host representation.
 *
 * The bytes are copied out with memcpy and reassembled with byte 0 as the
 * least significant byte, so on a little-endian host the result equals
 * `word`.
 *
 * Recorded compiler behaviour (x86-64, -O3):
 *   clang 3: recognises this as a load
 *   gcc 4.7: recognises this as a load */
uint32_t fromLE32_A(uint32_t word) {
    uint8_t x[4];
    memcpy(x, &word, 4);
    return
        ((uint32_t)(x[0]) <<  0) |
        ((uint32_t)(x[1]) <<  8) |
        ((uint32_t)(x[2]) << 16) |
        ((uint32_t)(x[3]) << 24);
}
/* fromBE32_A: treat the in-memory bytes of `word` as a big-endian
 * 32-bit value and return that value in host representation.
 *
 * The bytes are copied out with memcpy and reassembled with byte 0 as the
 * most significant byte, so on a little-endian host the result is `word`
 * with its four bytes reversed.
 *
 * Recorded compiler behaviour (x86-64, -O3):
 *   clang 3: uses the bswap instruction
 *   gcc 4.7: uses movs, shifts and ors */
uint32_t fromBE32_A(uint32_t word) {
    uint8_t x[4];
    memcpy(x, &word, 4);
    return
        ((uint32_t)(x[0]) << 24) |
        ((uint32_t)(x[1]) << 16) |
        ((uint32_t)(x[2]) <<  8) |
        ((uint32_t)(x[3]) <<  0);
}
/* words_fromLE32_A: convert `n` little-endian 32-bit words to host
 * representation in place, one fromLE32_A call per element.
 *
 * words: array of at least `n` elements, read and overwritten.
 * n:     element count; zero leaves the array untouched.
 *
 * Recorded compiler behaviour (x86-64, -O3):
 *   clang 3: turns this into an empty function (just a ret)
 *   gcc 4.7: removes the loop body but keeps the loop (!) */
void words_fromLE32_A(uint32_t *words, size_t n) {
    size_t i;
    for (i = 0; i < n; ++i) {
        words[i] = fromLE32_A(words[i]);
    }
}
/* words_fromBE32_A: convert `n` big-endian 32-bit words to host
 * representation in place, one fromBE32_A call per element.
 *
 * words: array of at least `n` elements, read and overwritten.
 * n:     element count; zero leaves the array untouched.
 *
 * Recorded compiler behaviour (x86-64, -O3):
 *   clang 3: turns this into a bswap loop
 *   gcc 4.7: loop containing movs, shifts and ors */
void words_fromBE32_A(uint32_t *words, size_t n) {
    size_t i;
    for (i = 0; i < n; ++i) {
        words[i] = fromBE32_A(words[i]);
    }
}
/* ---- B versions ---- */ | |
/* fromLE32_B: assemble a 32-bit value from four little-endian bytes.
 *
 * x: pointer to at least four readable bytes; x[0] becomes the least
 *    significant byte of the result and x[3] the most significant.
 *
 * Recorded compiler behaviour (x86-64, -O3):
 *   clang 3: uses movs, shifts and ors
 *   gcc 4.7: uses movs, shifts and ors */
uint32_t fromLE32_B(uint8_t *x) {
    return
        ((uint32_t)(x[0]) <<  0) |
        ((uint32_t)(x[1]) <<  8) |
        ((uint32_t)(x[2]) << 16) |
        ((uint32_t)(x[3]) << 24);
}
/* fromBE32_B: assemble a 32-bit value from four big-endian bytes.
 *
 * x: pointer to at least four readable bytes; x[0] becomes the most
 *    significant byte of the result and x[3] the least significant.
 *
 * Recorded compiler behaviour (x86-64, -O3):
 *   clang 3: uses movs, shifts and ors
 *   gcc 4.7: uses movs, shifts and ors */
uint32_t fromBE32_B(uint8_t *x) {
    return
        ((uint32_t)(x[0]) << 24) |
        ((uint32_t)(x[1]) << 16) |
        ((uint32_t)(x[2]) <<  8) |
        ((uint32_t)(x[3]) <<  0);
}
/* words_fromLE32_B: convert `n` little-endian 32-bit words to host
 * representation in place.
 *
 * Each element is copied into a local 4-byte buffer with memcpy and then
 * decoded by fromLE32_B, so the word is never accessed through a byte
 * pointer aliasing the array.
 *
 * words: array of at least `n` elements, read and overwritten.
 * n:     element count; zero leaves the array untouched.
 *
 * Recorded compiler behaviour (x86-64, -O3):
 *   clang 3: empty function (just a ret)
 *   gcc 4.7: removes the loop body, but keeps the loop (!) */
void words_fromLE32_B(uint32_t *words, size_t n) {
    uint8_t bytes[4];
    size_t i;
    for (i = 0; i < n; ++i) {
        memcpy(bytes, &words[i], 4);
        words[i] = fromLE32_B(bytes);
    }
}
/* words_fromBE32_B: convert `n` big-endian 32-bit words to host
 * representation in place.
 *
 * Each element is copied into a local 4-byte buffer with memcpy and then
 * decoded by fromBE32_B, so the word is never accessed through a byte
 * pointer aliasing the array.
 *
 * words: array of at least `n` elements, read and overwritten.
 * n:     element count; zero leaves the array untouched.
 *
 * Recorded compiler behaviour (x86-64, -O3):
 *   clang 3: turns this into a bswap loop
 *   gcc 4.7: loop containing movs, shifts and ors */
void words_fromBE32_B(uint32_t *words, size_t n) {
    uint8_t bytes[4];
    size_t i;
    for (i = 0; i < n; ++i) {
        memcpy(bytes, &words[i], 4);
        words[i] = fromBE32_B(bytes);
    }
}
/* ----------------------------------------------------- */ | |
/* clang 3.0 output */ | |
#if 0 | |
.file "byte-order-test.c" | |
.text | |
# clang 3.0 code for: uint32_t fromLE32_A(uint32_t word)
# In: %edi = word.  Out: %eax = result.
# The memcpy plus shift/or reassembly was folded away completely; the
# argument is returned unchanged.
.globl fromLE32_A | |
# Pad to a 16-byte boundary using 0x90 (nop) fill bytes.
.align 16, 0x90 | |
.type fromLE32_A,@function | |
fromLE32_A: # @fromLE32_A | |
.Ltmp0: | |
.cfi_startproc | |
# BB#0: | |
# result = word
movl %edi, %eax | |
ret | |
.Ltmp1: | |
# Symbol size = distance from the entry label to .Ltmp1.
.size fromLE32_A, .Ltmp1-fromLE32_A | |
.Ltmp2: | |
.cfi_endproc | |
.Leh_func_end0: | |
# clang 3.0 code for: uint32_t fromBE32_A(uint32_t word)
# In: %edi = word.  Out: %eax = result.
# The four-byte shift/or reassembly was recognised as a byte swap and
# replaced by a single bswap instruction.
.globl fromBE32_A | |
# Pad to a 16-byte boundary using 0x90 (nop) fill bytes.
.align 16, 0x90 | |
.type fromBE32_A,@function | |
fromBE32_A: # @fromBE32_A | |
.Ltmp3: | |
.cfi_startproc | |
# BB#0: | |
# Reverse the byte order of the argument in place ...
bswapl %edi | |
# ... then move it to the return register.
movl %edi, %eax | |
ret | |
.Ltmp4: | |
# Symbol size = distance from the entry label to .Ltmp4.
.size fromBE32_A, .Ltmp4-fromBE32_A | |
.Ltmp5: | |
.cfi_endproc | |
.Leh_func_end1: | |
# clang 3.0 code for: void words_fromLE32_A(uint32_t *words, size_t n)
# The per-element conversion was reduced to a no-op store of the value
# just loaded, and the whole loop was then removed: the function body is
# a bare ret and neither argument register is read.
.globl words_fromLE32_A | |
# Pad to a 16-byte boundary using 0x90 (nop) fill bytes.
.align 16, 0x90 | |
.type words_fromLE32_A,@function | |
words_fromLE32_A: # @words_fromLE32_A | |
.Ltmp6: | |
.cfi_startproc | |
# BB#0: # %.lr.ph | |
ret | |
.Ltmp7: | |
# Symbol size = distance from the entry label to .Ltmp7.
.size words_fromLE32_A, .Ltmp7-words_fromLE32_A | |
.Ltmp8: | |
.cfi_endproc | |
.Leh_func_end2: | |
# clang 3.0 code for: void words_fromBE32_A(uint32_t *words, size_t n)
# In: %rdi = words (advanced as the cursor), %rsi = n (counted down).
# Each element is loaded, byte-swapped with bswap and stored back.
.globl words_fromBE32_A | |
# Pad to a 16-byte boundary using 0x90 (nop) fill bytes.
.align 16, 0x90 | |
.type words_fromBE32_A,@function | |
words_fromBE32_A: # @words_fromBE32_A | |
.Ltmp9: | |
.cfi_startproc | |
# BB#0: | |
# n == 0: skip the loop entirely.
testq %rsi, %rsi | |
je .LBB3_2 | |
# Align the loop head to 16 bytes.
.align 16, 0x90 | |
.LBB3_1: # %.lr.ph | |
# =>This Inner Loop Header: Depth=1 | |
# Invariant: %rdi points at the next element, %rsi = elements left (> 0).
# eax = *words
movl (%rdi), %eax | |
# Reverse the four bytes.
bswapl %eax | |
# *words = eax
movl %eax, (%rdi) | |
# Advance to the next 4-byte element.
addq $4, %rdi | |
# One fewer element remaining; loop until the count reaches zero.
decq %rsi | |
jne .LBB3_1 | |
.LBB3_2: # %._crit_edge | |
ret | |
.Ltmp10: | |
# Symbol size = distance from the entry label to .Ltmp10.
.size words_fromBE32_A, .Ltmp10-words_fromBE32_A | |
.Ltmp11: | |
.cfi_endproc | |
.Leh_func_end3: | |
# clang 3.0 code for: uint32_t fromLE32_B(uint8_t *x)
# In: %rdi = x.  Out: %eax = x[0] | x[1]<<8 | x[2]<<16 | x[3]<<24.
# The four byte loads are NOT merged into one 32-bit load here; each byte
# is zero-extended, shifted and or-ed in, alternating between %eax and
# %ecx as the accumulator.
.globl fromLE32_B | |
# Pad to a 16-byte boundary using 0x90 (nop) fill bytes.
.align 16, 0x90 | |
.type fromLE32_B,@function | |
fromLE32_B: # @fromLE32_B | |
.Ltmp12: | |
.cfi_startproc | |
# BB#0: | |
# ecx = x[0]
movzbl (%rdi), %ecx | |
# eax = x[1] << 8
movzbl 1(%rdi), %eax | |
shll $8, %eax | |
# eax = x[0] | x[1]<<8
orl %ecx, %eax | |
# ecx = x[2] << 16
movzbl 2(%rdi), %ecx | |
shll $16, %ecx | |
# ecx = x[0] | x[1]<<8 | x[2]<<16
orl %eax, %ecx | |
# eax = x[3] << 24
movzbl 3(%rdi), %eax | |
shll $24, %eax | |
# eax = complete result
orl %ecx, %eax | |
ret | |
.Ltmp13: | |
# Symbol size = distance from the entry label to .Ltmp13.
.size fromLE32_B, .Ltmp13-fromLE32_B | |
.Ltmp14: | |
.cfi_endproc | |
.Leh_func_end4: | |
# clang 3.0 code for: uint32_t fromBE32_B(uint8_t *x)
# In: %rdi = x.  Out: %eax = x[0]<<24 | x[1]<<16 | x[2]<<8 | x[3].
# No bswap is used here; each byte is zero-extended, shifted and or-ed
# in, alternating between %eax and %ecx as the accumulator. Byte 3 is
# merged before byte 2.
.globl fromBE32_B | |
# Pad to a 16-byte boundary using 0x90 (nop) fill bytes.
.align 16, 0x90 | |
.type fromBE32_B,@function | |
fromBE32_B: # @fromBE32_B | |
.Ltmp15: | |
.cfi_startproc | |
# BB#0: | |
# ecx = x[0] << 24
movzbl (%rdi), %ecx | |
shll $24, %ecx | |
# eax = x[1] << 16
movzbl 1(%rdi), %eax | |
shll $16, %eax | |
# eax = x[0]<<24 | x[1]<<16
orl %ecx, %eax | |
# ecx = x[3], needs no shift
movzbl 3(%rdi), %ecx | |
# ecx = x[0]<<24 | x[1]<<16 | x[3]
orl %eax, %ecx | |
# eax = x[2] << 8
movzbl 2(%rdi), %eax | |
shll $8, %eax | |
# eax = complete result
orl %ecx, %eax | |
ret | |
.Ltmp16: | |
# Symbol size = distance from the entry label to .Ltmp16.
.size fromBE32_B, .Ltmp16-fromBE32_B | |
.Ltmp17: | |
.cfi_endproc | |
.Leh_func_end5: | |
# clang 3.0 code for: void words_fromLE32_B(uint32_t *words, size_t n)
# After inlining, the memcpy-to-buffer plus byte reassembly was recognised
# as storing back the value just loaded, and the whole loop was removed:
# the function body is a bare ret and neither argument register is read.
.globl words_fromLE32_B | |
# Pad to a 16-byte boundary using 0x90 (nop) fill bytes.
.align 16, 0x90 | |
.type words_fromLE32_B,@function | |
words_fromLE32_B: # @words_fromLE32_B | |
.Ltmp18: | |
.cfi_startproc | |
# BB#0: # %.lr.ph | |
ret | |
.Ltmp19: | |
# Symbol size = distance from the entry label to .Ltmp19.
.size words_fromLE32_B, .Ltmp19-words_fromLE32_B | |
.Ltmp20: | |
.cfi_endproc | |
.Leh_func_end6: | |
# clang 3.0 code for: void words_fromBE32_B(uint32_t *words, size_t n)
# In: %rdi = words (advanced as the cursor), %rsi = n (counted down).
# The local byte buffer and memcpy are gone; each element is loaded,
# byte-swapped with bswap and stored back.
.globl words_fromBE32_B | |
# Pad to a 16-byte boundary using 0x90 (nop) fill bytes.
.align 16, 0x90 | |
.type words_fromBE32_B,@function | |
words_fromBE32_B: # @words_fromBE32_B | |
.Ltmp21: | |
.cfi_startproc | |
# BB#0: | |
# n == 0: skip the loop entirely.
testq %rsi, %rsi | |
je .LBB7_2 | |
# Align the loop head to 16 bytes.
.align 16, 0x90 | |
.LBB7_1: # %.lr.ph | |
# =>This Inner Loop Header: Depth=1 | |
# Invariant: %rdi points at the next element, %rsi = elements left (> 0).
# eax = *words
movl (%rdi), %eax | |
# Reverse the four bytes.
bswapl %eax | |
# *words = eax
movl %eax, (%rdi) | |
# Advance to the next 4-byte element.
addq $4, %rdi | |
# One fewer element remaining; loop until the count reaches zero.
decq %rsi | |
jne .LBB7_1 | |
.LBB7_2: # %._crit_edge | |
ret | |
.Ltmp22: | |
# Symbol size = distance from the entry label to .Ltmp22.
.size words_fromBE32_B, .Ltmp22-words_fromBE32_B | |
.Ltmp23: | |
.cfi_endproc | |
.Leh_func_end7: | |
.section ".note.GNU-stack","",@progbits | |
#endif | |
/* gcc 4.7.0 output */ | |
#if 0 | |
.file "byte-order-test.c" | |
.text | |
# gcc 4.7.0 code for: uint32_t fromLE32_A(uint32_t word)
# In: %edi = word.  Out: %eax = result.
# The memcpy plus shift/or reassembly was folded away completely; the
# argument is returned unchanged.
# Align the entry to 16 bytes (2^4), allowing up to 15 bytes of padding.
.p2align 4,,15 | |
.globl fromLE32_A | |
.type fromLE32_A, @function | |
fromLE32_A: | |
.LFB12: | |
.cfi_startproc | |
# result = word
movl %edi, %eax | |
ret | |
.cfi_endproc | |
.LFE12: | |
# Symbol size = current location minus the entry label.
.size fromLE32_A, .-fromLE32_A | |
# gcc 4.7.0 code for: uint32_t fromBE32_A(uint32_t word)
# In: %edi = word.  Out: %eax = word with its four bytes reversed.
# No bswap is emitted; the swap is built from register moves, shifts, a
# mask and ors. Below, b0..b3 name the bytes of word from least to most
# significant.
# Align the entry to 16 bytes (2^4), allowing up to 15 bytes of padding.
.p2align 4,,15 | |
.globl fromBE32_A | |
.type fromBE32_A, @function | |
fromBE32_A: | |
.LFB13: | |
.cfi_startproc | |
# Copy the argument into %rcx so that b1 can be read through %ch.
movq %rdi, %rcx | |
movl %edi, %eax | |
# edx = b1
movzbl %ch, %edx | |
# eax = b0 << 24 (higher bytes are shifted out)
sall $24, %eax | |
# edx = b1 << 16
sall $16, %edx | |
# eax = b0<<24 | b1<<16
orl %edx, %eax | |
movl %edi, %edx | |
# edi = word >> 8, which places b2 in bits 8..15
shrl $8, %edi | |
# edx = b3
shrl $24, %edx | |
# 65280 = 0xFF00: keep only b2 << 8
andl $65280, %edi | |
# eax |= b3
orl %edx, %eax | |
# eax |= b2 << 8, completing the result
orl %edi, %eax | |
ret | |
.cfi_endproc | |
.LFE13: | |
# Symbol size = current location minus the entry label.
.size fromBE32_A, .-fromBE32_A | |
# gcc 4.7.0 code for: void words_fromLE32_A(uint32_t *words, size_t n)
# In: %rdi = words, %rsi = n.
# The conversion inside the loop was optimised to nothing, but the loop
# skeleton survives: the pointer is still stepped across the array with
# no loads or stores.
# Align the entry to 16 bytes (2^4), allowing up to 15 bytes of padding.
.p2align 4,,15 | |
.globl words_fromLE32_A | |
.type words_fromLE32_A, @function | |
words_fromLE32_A: | |
.LFB14: | |
.cfi_startproc | |
# Set ZF when n == 0; the lea below leaves the flags untouched.
testq %rsi, %rsi | |
# rax = words + n, the one-past-the-end pointer
leaq (%rdi,%rsi,4), %rax | |
je .L3 | |
# Loop head alignment: 16 bytes when that costs at most 10 bytes of
# padding, otherwise 8 bytes.
.p2align 4,,10 | |
.p2align 3 | |
.L7: | |
# Empty loop body: only the cursor advances, one element per pass.
addq $4, %rdi | |
cmpq %rax, %rdi | |
jne .L7 | |
.L3: | |
# rep + ret is a two-byte return encoding; presumably the AMD
# branch-predictor workaround for a ret that is a jump target (TODO confirm).
rep | |
ret | |
.cfi_endproc | |
.LFE14: | |
# Symbol size = current location minus the entry label.
.size words_fromLE32_A, .-words_fromLE32_A | |
# gcc 4.7.0 code for: void words_fromBE32_A(uint32_t *words, size_t n)
# In: %rdi = words (advanced as the cursor), %rsi = n.
# Each element is byte-reversed in place with moves, shifts, a mask and
# ors rather than bswap. Below, b0..b3 name the bytes of the loaded
# element from least to most significant.
# Align the entry to 16 bytes (2^4), allowing up to 15 bytes of padding.
.p2align 4,,15 | |
.globl words_fromBE32_A | |
.type words_fromBE32_A, @function | |
words_fromBE32_A: | |
.LFB15: | |
.cfi_startproc | |
# Set ZF when n == 0; the lea below leaves the flags untouched.
testq %rsi, %rsi | |
# r8 = words + n, the one-past-the-end pointer
leaq (%rdi,%rsi,4), %r8 | |
je .L10 | |
# Loop head alignment: 16 bytes when that costs at most 10 bytes of
# padding, otherwise 8 bytes.
.p2align 4,,10 | |
.p2align 3 | |
.L14: | |
# edx = current element
movl (%rdi), %edx | |
movl %edx, %eax | |
# ecx = b1, read through %dh
movzbl %dh, %ecx | |
# ecx = b1 << 16
sall $16, %ecx | |
# eax = b0 << 24
sall $24, %eax | |
# eax = b0<<24 | b1<<16
orl %ecx, %eax | |
movl %edx, %ecx | |
# edx = element >> 8, which places b2 in bits 8..15
shrl $8, %edx | |
# ecx = b3
shrl $24, %ecx | |
# 65280 = 0xFF00: keep only b2 << 8
andl $65280, %edx | |
# eax |= b3
orl %ecx, %eax | |
# eax |= b2 << 8, completing the swapped value
orl %edx, %eax | |
# Store the swapped value back over the element.
movl %eax, (%rdi) | |
# Advance the cursor and loop until it reaches the end pointer.
addq $4, %rdi | |
cmpq %r8, %rdi | |
jne .L14 | |
.L10: | |
# rep + ret is a two-byte return encoding; presumably the AMD
# branch-predictor workaround for a ret that is a jump target (TODO confirm).
rep | |
ret | |
.cfi_endproc | |
.LFE15: | |
# Symbol size = current location minus the entry label.
.size words_fromBE32_A, .-words_fromBE32_A | |
# gcc 4.7.0 code for: uint32_t fromLE32_B(uint8_t *x)
# In: %rdi = x.  Out: %eax = x[0] | x[1]<<8 | x[2]<<16 | x[3]<<24.
# The four byte loads are NOT merged into one 32-bit load; %eax is the
# accumulator and %edx holds each byte as it is merged in.
# Align the entry to 16 bytes (2^4), allowing up to 15 bytes of padding.
.p2align 4,,15 | |
.globl fromLE32_B | |
.type fromLE32_B, @function | |
fromLE32_B: | |
.LFB16: | |
.cfi_startproc | |
# eax = x[1], edx = x[2]
movzbl 1(%rdi), %eax | |
movzbl 2(%rdi), %edx | |
# eax = x[1] << 8, edx = x[2] << 16
sall $8, %eax | |
sall $16, %edx | |
# eax = x[1]<<8 | x[2]<<16
orl %edx, %eax | |
# eax |= x[0]
movzbl (%rdi), %edx | |
orl %edx, %eax | |
# eax |= x[3] << 24, completing the result
movzbl 3(%rdi), %edx | |
sall $24, %edx | |
orl %edx, %eax | |
ret | |
.cfi_endproc | |
.LFE16: | |
# Symbol size = current location minus the entry label.
.size fromLE32_B, .-fromLE32_B | |
# gcc 4.7.0 code for: uint32_t fromBE32_B(uint8_t *x)
# In: %rdi = x.  Out: %eax = x[0]<<24 | x[1]<<16 | x[2]<<8 | x[3].
# No bswap is emitted; %eax is the accumulator and %edx holds each byte
# as it is merged in. Byte 3 is merged before byte 2.
# Align the entry to 16 bytes (2^4), allowing up to 15 bytes of padding.
.p2align 4,,15 | |
.globl fromBE32_B | |
.type fromBE32_B, @function | |
fromBE32_B: | |
.LFB17: | |
.cfi_startproc | |
# eax = x[0], edx = x[1]
movzbl (%rdi), %eax | |
movzbl 1(%rdi), %edx | |
# eax = x[0] << 24, edx = x[1] << 16
sall $24, %eax | |
sall $16, %edx | |
# eax = x[0]<<24 | x[1]<<16
orl %edx, %eax | |
# eax |= x[3], which needs no shift
movzbl 3(%rdi), %edx | |
orl %edx, %eax | |
# eax |= x[2] << 8, completing the result
movzbl 2(%rdi), %edx | |
sall $8, %edx | |
orl %edx, %eax | |
ret | |
.cfi_endproc | |
.LFE17: | |
# Symbol size = current location minus the entry label.
.size fromBE32_B, .-fromBE32_B | |
# gcc 4.7.0 code for: void words_fromLE32_B(uint32_t *words, size_t n)
# In: %rsi = n. The words pointer in %rdi is never read.
# The memcpy and conversion were optimised to nothing, but the loop
# skeleton survives as a bare counter running from 0 up to n.
# Align the entry to 16 bytes (2^4), allowing up to 15 bytes of padding.
.p2align 4,,15 | |
.globl words_fromLE32_B | |
.type words_fromLE32_B, @function | |
words_fromLE32_B: | |
.LFB18: | |
.cfi_startproc | |
# i = 0 (writing %eax clears the whole of %rax)
xorl %eax, %eax | |
# n == 0: skip the loop entirely.
testq %rsi, %rsi | |
je .L18 | |
# Loop head alignment: 16 bytes when that costs at most 10 bytes of
# padding, otherwise 8 bytes.
.p2align 4,,10 | |
.p2align 3 | |
.L20: | |
# Empty loop body: ++i, repeat until i == n.
addq $1, %rax | |
cmpq %rsi, %rax | |
jne .L20 | |
.L18: | |
# rep + ret is a two-byte return encoding; presumably the AMD
# branch-predictor workaround for a ret that is a jump target (TODO confirm).
rep | |
ret | |
.cfi_endproc | |
.LFE18: | |
# Symbol size = current location minus the entry label.
.size words_fromLE32_B, .-words_fromLE32_B | |
# gcc 4.7.0 code for: void words_fromBE32_B(uint32_t *words, size_t n)
# In: %rdi = words (advanced as the cursor), %rsi = n.
# The local byte buffer and memcpy are gone, but the byte reversal is
# still done with moves, shifts, a mask and ors rather than bswap.
# %r8 is the index i. Below, b0..b3 name the bytes of the loaded element
# from least to most significant.
# Align the entry to 16 bytes (2^4), allowing up to 15 bytes of padding.
.p2align 4,,15 | |
.globl words_fromBE32_B | |
.type words_fromBE32_B, @function | |
words_fromBE32_B: | |
.LFB19: | |
.cfi_startproc | |
# i = 0 (writing %r8d clears the whole of %r8)
xorl %r8d, %r8d | |
# n == 0: skip the loop entirely.
testq %rsi, %rsi | |
je .L23 | |
# Loop head alignment: 16 bytes when that costs at most 10 bytes of
# padding, otherwise 8 bytes.
.p2align 4,,10 | |
.p2align 3 | |
.L25: | |
# edx = current element
movl (%rdi), %edx | |
# ++i
addq $1, %r8 | |
movl %edx, %eax | |
# ecx = b1, read through %dh
movzbl %dh, %ecx | |
# ecx = b1 << 16
sall $16, %ecx | |
# eax = b0 << 24
sall $24, %eax | |
# eax = b0<<24 | b1<<16
orl %ecx, %eax | |
movl %edx, %ecx | |
# edx = element >> 8, which places b2 in bits 8..15
shrl $8, %edx | |
# ecx = b3
shrl $24, %ecx | |
# 65280 = 0xFF00: keep only b2 << 8
andl $65280, %edx | |
# eax |= b3
orl %ecx, %eax | |
# eax |= b2 << 8, completing the swapped value
orl %edx, %eax | |
# Store the swapped value back over the element.
movl %eax, (%rdi) | |
# Advance the cursor; loop until i == n.
addq $4, %rdi | |
cmpq %rsi, %r8 | |
jne .L25 | |
.L23: | |
# rep + ret is a two-byte return encoding; presumably the AMD
# branch-predictor workaround for a ret that is a jump target (TODO confirm).
rep | |
ret | |
.cfi_endproc | |
.LFE19: | |
# Symbol size = current location minus the entry label.
.size words_fromBE32_B, .-words_fromBE32_B | |
.ident "GCC: (GNU) 4.7.0 20120324 (prerelease)" | |
.section .note.GNU-stack,"",@progbits | |
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment