johnbartholomew · April 4, 2012 20:48
diff --git a/byte-order-test.c b/byte-order-test.c
 #include <string.h>
 #include <stdint.h>

 /* compiled with:
 * {gcc,clang} -std=c89 -pedantic -Wall -Wextra -O3 -S -c byte-order-test.c
 *
 * Built on and for an x86-64 system.
 * The full assembly output from both compilers is embedded at the bottom.
 *
 * Summary:
 *  - Clang gets it fully right for some code structures, but not for others,
 *    recognising both a straight load and a 32-bit endian-swap.
 *  - GCC 4.7.0 gets it partially right for some code structures, but not for
 *    others, recognising a straight load, but not a 32-bit endian-swap
 *    (or conceivably recognising an endian-swap but deciding that the bswap
 *     instruction is slower than explicitly coded movs and logic ops;
 *     I don't know whether that's true for typical x86-64 architectures).
 *    It also doesn't always remove the loop, even in a couple of cases
 *    where it eliminates the memory -> register -> memory step.
 */

 /* ---- A versions ---- */

 /* Rebuilds a 32-bit value from the four bytes that make up `word` in memory,
 * treating byte 0 as the least significant (a little-endian interpretation).
 * On the little-endian x86-64 target this file was built for, the result
 * equals `word`.
 *
 * clang 3: recognises this as a load
 * gcc 4.7: recognises this as a load */
 uint32_t fromLE32_A(uint32_t word) {
    uint8_t x[4];
    memcpy(x, &word, 4); /* expose the stored bytes of word as x[0..3] */
    return
        ((uint32_t)(x[0]) <<  0) |
        ((uint32_t)(x[1]) <<  8) |
        ((uint32_t)(x[2]) << 16) |
        ((uint32_t)(x[3]) << 24);
 }

 /* Rebuilds a 32-bit value from the four bytes that make up `word` in memory,
 * treating byte 0 as the most significant (a big-endian interpretation).
 * On the little-endian x86-64 target this file was built for, that amounts
 * to reversing the byte order of `word`.
 *
 * clang 3: uses the bswap instruction
 * gcc 4.7: uses movs, shifts and ors */
 uint32_t fromBE32_A(uint32_t word) {
    uint8_t x[4];
    memcpy(x, &word, 4); /* expose the stored bytes of word as x[0..3] */
    return
        ((uint32_t)(x[0]) << 24) |
        ((uint32_t)(x[1]) << 16) |
        ((uint32_t)(x[2]) <<  8) |
        ((uint32_t)(x[3]) <<  0);
 }

 /* Replaces each of the n elements of `words`, in place, with the result of
 * fromLE32_A on that element. With n == 0 the loop body never runs.
 *
 * clang 3: turns this into an empty function (just a ret)
 * gcc 4.7: removes the loop body but keeps the loop (!) */
 void words_fromLE32_A(uint32_t *words, size_t n) {
    size_t i;
    for (i = 0; i < n; ++i) {
        words[i] = fromLE32_A(words[i]);
    }
 }

 /* Replaces each of the n elements of `words`, in place, with the result of
 * fromBE32_A on that element. With n == 0 the loop body never runs.
 *
 * clang 3: turns this into a bswap loop
 * gcc 4.7: loop containing movs, shifts and ors */
 void words_fromBE32_A(uint32_t *words, size_t n) {
    size_t i;
    for (i = 0; i < n; ++i) {
        words[i] = fromBE32_A(words[i]);
    }
 }

 /* ---- B versions ---- */

 /* Assembles a 32-bit value from the four bytes x[0..3], with x[0] as the
 * least significant byte (little-endian order). Unlike the A version, the
 * bytes are read through the caller's pointer rather than from a local copy.
 *
 * clang 3: uses movs, shifts and ors
 * gcc 4.7: uses movs, shifts and ors */
 uint32_t fromLE32_B(uint8_t *x) {
    return
        ((uint32_t)(x[0]) <<  0) |
        ((uint32_t)(x[1]) <<  8) |
        ((uint32_t)(x[2]) << 16) |
        ((uint32_t)(x[3]) << 24);
 }

 /* Assembles a 32-bit value from the four bytes x[0..3], with x[0] as the
 * most significant byte (big-endian order). Unlike the A version, the
 * bytes are read through the caller's pointer rather than from a local copy.
 *
 * clang 3: uses movs, shifts and ors
 * gcc 4.7: uses movs, shifts and ors */
 uint32_t fromBE32_B(uint8_t *x) {
    return
        ((uint32_t)(x[0]) << 24) |
        ((uint32_t)(x[1]) << 16) |
        ((uint32_t)(x[2]) <<  8) |
        ((uint32_t)(x[3]) <<  0);
 }

 /* Replaces each of the n elements of `words`, in place: the element's bytes
 * are copied into a local 4-byte buffer, which is then decoded by
 * fromLE32_B. With n == 0 the loop body never runs.
 *
 * clang 3: empty function (just a ret)
 * gcc 4.7: removes the loop body, but keeps the loop (!) */
 void words_fromLE32_B(uint32_t *words, size_t n) {
    uint8_t bytes[4];
    size_t i;
    for (i = 0; i < n; ++i) {
        memcpy(bytes, &words[i], 4); /* stage the word's stored bytes */
        words[i] = fromLE32_B(bytes);
    }
 }

 /* Replaces each of the n elements of `words`, in place: the element's bytes
 * are copied into a local 4-byte buffer, which is then decoded by
 * fromBE32_B. With n == 0 the loop body never runs.
 *
 * clang 3: turns this into a bswap loop
 * gcc 4.7: loop containing movs, shifts and ors */
 void words_fromBE32_B(uint32_t *words, size_t n) {
    uint8_t bytes[4];
    size_t i;
    for (i = 0; i < n; ++i) {
        memcpy(bytes, &words[i], 4); /* stage the word's stored bytes */
        words[i] = fromBE32_B(bytes);
    }
 }

 /* ----------------------------------------------------- */
 /* clang 3.0 output */
 #if 0
    .file   "byte-order-test.c"
    .text
    .globl  fromLE32_A
    .align  16, 0x90
    .type   fromLE32_A,@function
 fromLE32_A:                             # @fromLE32_A
 .Ltmp0:
    .cfi_startproc
 # BB#0:
    movl    %edi, %eax
    ret
 .Ltmp1:
    .size   fromLE32_A, .Ltmp1-fromLE32_A
 .Ltmp2:
    .cfi_endproc
 .Leh_func_end0:

    .globl  fromBE32_A
    .align  16, 0x90
    .type   fromBE32_A,@function
 fromBE32_A:                             # @fromBE32_A
 .Ltmp3:
    .cfi_startproc
 # BB#0:
    bswapl  %edi
    movl    %edi, %eax
    ret
 .Ltmp4:
    .size   fromBE32_A, .Ltmp4-fromBE32_A
 .Ltmp5:
    .cfi_endproc
 .Leh_func_end1:

    .globl  words_fromLE32_A
    .align  16, 0x90
    .type   words_fromLE32_A,@function
 words_fromLE32_A:                       # @words_fromLE32_A
 .Ltmp6:
    .cfi_startproc
 # BB#0:                                 # %.lr.ph
    ret
 .Ltmp7:
    .size   words_fromLE32_A, .Ltmp7-words_fromLE32_A
 .Ltmp8:
    .cfi_endproc
 .Leh_func_end2:

    .globl  words_fromBE32_A
    .align  16, 0x90
    .type   words_fromBE32_A,@function
 words_fromBE32_A:                       # @words_fromBE32_A
 .Ltmp9:
    .cfi_startproc
 # BB#0:
    testq   %rsi, %rsi
    je  .LBB3_2
    .align  16, 0x90
 .LBB3_1:                                # %.lr.ph
                                        # =>This Inner Loop Header: Depth=1
    movl    (%rdi), %eax
    bswapl  %eax
    movl    %eax, (%rdi)
    addq    $4, %rdi
    decq    %rsi
    jne .LBB3_1
 .LBB3_2:                                # %._crit_edge
    ret
 .Ltmp10:
    .size   words_fromBE32_A, .Ltmp10-words_fromBE32_A
 .Ltmp11:
    .cfi_endproc
 .Leh_func_end3:

    .globl  fromLE32_B
    .align  16, 0x90
    .type   fromLE32_B,@function
 fromLE32_B:                             # @fromLE32_B
 .Ltmp12:
    .cfi_startproc
 # BB#0:
    movzbl  (%rdi), %ecx
    movzbl  1(%rdi), %eax
    shll    $8, %eax
    orl %ecx, %eax
    movzbl  2(%rdi), %ecx
    shll    $16, %ecx
    orl %eax, %ecx
    movzbl  3(%rdi), %eax
    shll    $24, %eax
    orl %ecx, %eax
    ret
 .Ltmp13:
    .size   fromLE32_B, .Ltmp13-fromLE32_B
 .Ltmp14:
    .cfi_endproc
 .Leh_func_end4:

    .globl  fromBE32_B
    .align  16, 0x90
    .type   fromBE32_B,@function
 fromBE32_B:                             # @fromBE32_B
 .Ltmp15:
    .cfi_startproc
 # BB#0:
    movzbl  (%rdi), %ecx
    shll    $24, %ecx
    movzbl  1(%rdi), %eax
    shll    $16, %eax
    orl %ecx, %eax
    movzbl  3(%rdi), %ecx
    orl %eax, %ecx
    movzbl  2(%rdi), %eax
    shll    $8, %eax
    orl %ecx, %eax
    ret
 .Ltmp16:
    .size   fromBE32_B, .Ltmp16-fromBE32_B
 .Ltmp17:
    .cfi_endproc
 .Leh_func_end5:

    .globl  words_fromLE32_B
    .align  16, 0x90
    .type   words_fromLE32_B,@function
 words_fromLE32_B:                       # @words_fromLE32_B
 .Ltmp18:
    .cfi_startproc
 # BB#0:                                 # %.lr.ph
    ret
 .Ltmp19:
    .size   words_fromLE32_B, .Ltmp19-words_fromLE32_B
 .Ltmp20:
    .cfi_endproc
 .Leh_func_end6:

    .globl  words_fromBE32_B
    .align  16, 0x90
    .type   words_fromBE32_B,@function
 words_fromBE32_B:                       # @words_fromBE32_B
 .Ltmp21:
    .cfi_startproc
 # BB#0:
    testq   %rsi, %rsi
    je  .LBB7_2
    .align  16, 0x90
 .LBB7_1:                                # %.lr.ph
                                        # =>This Inner Loop Header: Depth=1
    movl    (%rdi), %eax
    bswapl  %eax
    movl    %eax, (%rdi)
    addq    $4, %rdi
    decq    %rsi
    jne .LBB7_1
 .LBB7_2:                                # %._crit_edge
    ret
 .Ltmp22:
    .size   words_fromBE32_B, .Ltmp22-words_fromBE32_B
 .Ltmp23:
    .cfi_endproc
 .Leh_func_end7:


    .section    ".note.GNU-stack","",@progbits
 #endif

 /* gcc 4.7.0 output */
 #if 0
    .file   "byte-order-test.c"
    .text
    .p2align 4,,15
    .globl  fromLE32_A
    .type   fromLE32_A, @function
 fromLE32_A:
 .LFB12:
    .cfi_startproc
    movl    %edi, %eax
    ret
    .cfi_endproc
 .LFE12:
    .size   fromLE32_A, .-fromLE32_A
    .p2align 4,,15
    .globl  fromBE32_A
    .type   fromBE32_A, @function
 fromBE32_A:
 .LFB13:
    .cfi_startproc
    movq    %rdi, %rcx
    movl    %edi, %eax
    movzbl  %ch, %edx
    sall    $24, %eax
    sall    $16, %edx
    orl %edx, %eax
    movl    %edi, %edx
    shrl    $8, %edi
    shrl    $24, %edx
    andl    $65280, %edi
    orl %edx, %eax
    orl %edi, %eax
    ret
    .cfi_endproc
 .LFE13:
    .size   fromBE32_A, .-fromBE32_A
    .p2align 4,,15
    .globl  words_fromLE32_A
    .type   words_fromLE32_A, @function
 words_fromLE32_A:
 .LFB14:
    .cfi_startproc
    testq   %rsi, %rsi
    leaq    (%rdi,%rsi,4), %rax
    je  .L3
    .p2align 4,,10
    .p2align 3
 .L7:
    addq    $4, %rdi
    cmpq    %rax, %rdi
    jne .L7
 .L3:
    rep
    ret
    .cfi_endproc
 .LFE14:
    .size   words_fromLE32_A, .-words_fromLE32_A
    .p2align 4,,15
    .globl  words_fromBE32_A
    .type   words_fromBE32_A, @function
 words_fromBE32_A:
 .LFB15:
    .cfi_startproc
    testq   %rsi, %rsi
    leaq    (%rdi,%rsi,4), %r8
    je  .L10
    .p2align 4,,10
    .p2align 3
 .L14:
    movl    (%rdi), %edx
    movl    %edx, %eax
    movzbl  %dh, %ecx
    sall    $16, %ecx
    sall    $24, %eax
    orl %ecx, %eax
    movl    %edx, %ecx
    shrl    $8, %edx
    shrl    $24, %ecx
    andl    $65280, %edx
    orl %ecx, %eax
    orl %edx, %eax
    movl    %eax, (%rdi)
    addq    $4, %rdi
    cmpq    %r8, %rdi
    jne .L14
 .L10:
    rep
    ret
    .cfi_endproc
 .LFE15:
    .size   words_fromBE32_A, .-words_fromBE32_A
    .p2align 4,,15
    .globl  fromLE32_B
    .type   fromLE32_B, @function
 fromLE32_B:
 .LFB16:
    .cfi_startproc
    movzbl  1(%rdi), %eax
    movzbl  2(%rdi), %edx
    sall    $8, %eax
    sall    $16, %edx
    orl %edx, %eax
    movzbl  (%rdi), %edx
    orl %edx, %eax
    movzbl  3(%rdi), %edx
    sall    $24, %edx
    orl %edx, %eax
    ret
    .cfi_endproc
 .LFE16:
    .size   fromLE32_B, .-fromLE32_B
    .p2align 4,,15
    .globl  fromBE32_B
    .type   fromBE32_B, @function
 fromBE32_B:
 .LFB17:
    .cfi_startproc
    movzbl  (%rdi), %eax
    movzbl  1(%rdi), %edx
    sall    $24, %eax
    sall    $16, %edx
    orl %edx, %eax
    movzbl  3(%rdi), %edx
    orl %edx, %eax
    movzbl  2(%rdi), %edx
    sall    $8, %edx
    orl %edx, %eax
    ret
    .cfi_endproc
 .LFE17:
    .size   fromBE32_B, .-fromBE32_B
    .p2align 4,,15
    .globl  words_fromLE32_B
    .type   words_fromLE32_B, @function
 words_fromLE32_B:
 .LFB18:
    .cfi_startproc
    xorl    %eax, %eax
    testq   %rsi, %rsi
    je  .L18
    .p2align 4,,10
    .p2align 3
 .L20:
    addq    $1, %rax
    cmpq    %rsi, %rax
    jne .L20
 .L18:
    rep
    ret
    .cfi_endproc
 .LFE18:
    .size   words_fromLE32_B, .-words_fromLE32_B
    .p2align 4,,15
    .globl  words_fromBE32_B
    .type   words_fromBE32_B, @function
 words_fromBE32_B:
 .LFB19:
    .cfi_startproc
    xorl    %r8d, %r8d
    testq   %rsi, %rsi
    je  .L23
    .p2align 4,,10
    .p2align 3
 .L25:
    movl    (%rdi), %edx
    addq    $1, %r8
    movl    %edx, %eax
    movzbl  %dh, %ecx
    sall    $16, %ecx
    sall    $24, %eax
    orl %ecx, %eax
    movl    %edx, %ecx
    shrl    $8, %edx
    shrl    $24, %ecx
    andl    $65280, %edx
    orl %ecx, %eax
    orl %edx, %eax
    movl    %eax, (%rdi)
    addq    $4, %rdi
    cmpq    %rsi, %r8
    jne .L25
 .L23:
    rep
    ret
    .cfi_endproc
 .LFE19:
    .size   words_fromBE32_B, .-words_fromBE32_B
    .ident  "GCC: (GNU) 4.7.0 20120324 (prerelease)"
    .section    .note.GNU-stack,"",@progbits
 #endif
	#include <string.h>
	#include <stdint.h>

	/* compiled with:
	* {gcc,clang} -std=c89 -pedantic -Wall -Wextra -O3 -S -c byte-order-test.c
	*
	* Built on and for an x86-64 system.
	* The full assembly output from both compilers is embedded at the bottom.
	*
	* Summary:
	* - Clang gets it fully right for some code structures, but not for others,
	* recognising both a straight load and a 32-bit endian-swap.
	* - GCC 4.7.0 gets it partially right for some code structures, but not for
	* others, recognising a straight load, but not a 32-bit endian-swap
	* (or conceivably recognising an endian-swap but deciding that the bswap
	* instruction is slower than explicitly coded movs and logic ops;
	* I don't know whether that's true for typical x86-64 architectures).
	* It also doesn't always remove the loop, even in a couple of cases
	* where it eliminates the memory -> register -> memory step.
	*/

	/* ---- A versions ---- */

	/* Rebuilds a 32-bit value from the four bytes that make up `word` in memory,
	* treating byte 0 as the least significant (a little-endian interpretation).
	* On a little-endian host the result equals `word`.
	*
	* clang 3: recognises this as a load
	* gcc 4.7: recognises this as a load */
	uint32_t fromLE32_A(uint32_t word) {
	    uint8_t x[4];
	    memcpy(x, &word, 4); /* expose the stored bytes of word as x[0..3] */
	    return
	        ((uint32_t)(x[0]) <<  0) |
	        ((uint32_t)(x[1]) <<  8) |
	        ((uint32_t)(x[2]) << 16) |
	        ((uint32_t)(x[3]) << 24);
	}

	/* Rebuilds a 32-bit value from the four bytes that make up `word` in memory,
	* treating byte 0 as the most significant (a big-endian interpretation).
	* On a little-endian host that amounts to reversing the byte order of `word`.
	*
	* clang 3: uses the bswap instruction
	* gcc 4.7: uses movs, shifts and ors */
	uint32_t fromBE32_A(uint32_t word) {
	    uint8_t x[4];
	    memcpy(x, &word, 4); /* expose the stored bytes of word as x[0..3] */
	    return
	        ((uint32_t)(x[0]) << 24) |
	        ((uint32_t)(x[1]) << 16) |
	        ((uint32_t)(x[2]) <<  8) |
	        ((uint32_t)(x[3]) <<  0);
	}

	/* Replaces each of the n elements of `words`, in place, with the result of
	* fromLE32_A on that element. With n == 0 the loop body never runs.
	*
	* clang 3: turns this into an empty function (just a ret)
	* gcc 4.7: removes the loop body but keeps the loop (!) */
	void words_fromLE32_A(uint32_t *words, size_t n) {
	size_t i;
	for (i = 0; i < n; ++i) {
	words[i] = fromLE32_A(words[i]);
	}
	}

	/* Replaces each of the n elements of `words`, in place, with the result of
	* fromBE32_A on that element. With n == 0 the loop body never runs.
	*
	* clang 3: turns this into a bswap loop
	* gcc 4.7: loop containing movs, shifts and ors */
	void words_fromBE32_A(uint32_t *words, size_t n) {
	size_t i;
	for (i = 0; i < n; ++i) {
	words[i] = fromBE32_A(words[i]);
	}
	}

	/* ---- B versions ---- */

	/* Assembles a 32-bit value from the four bytes x[0..3], with x[0] as the
	* least significant byte (little-endian order). The bytes are read through
	* the caller's pointer; x must point to at least four readable bytes.
	*
	* clang 3: uses movs, shifts and ors
	* gcc 4.7: uses movs, shifts and ors */
	uint32_t fromLE32_B(uint8_t *x) {
	    return
	        ((uint32_t)(x[0]) <<  0) |
	        ((uint32_t)(x[1]) <<  8) |
	        ((uint32_t)(x[2]) << 16) |
	        ((uint32_t)(x[3]) << 24);
	}

	/* Assembles a 32-bit value from the four bytes x[0..3], with x[0] as the
	* most significant byte (big-endian order). The bytes are read through
	* the caller's pointer; x must point to at least four readable bytes.
	*
	* clang 3: uses movs, shifts and ors
	* gcc 4.7: uses movs, shifts and ors */
	uint32_t fromBE32_B(uint8_t *x) {
	    return
	        ((uint32_t)(x[0]) << 24) |
	        ((uint32_t)(x[1]) << 16) |
	        ((uint32_t)(x[2]) <<  8) |
	        ((uint32_t)(x[3]) <<  0);
	}

	/* Replaces each of the n elements of `words`, in place: the element's bytes
	* are copied into a local 4-byte buffer, which is then decoded by
	* fromLE32_B. With n == 0 the loop body never runs.
	*
	* clang 3: empty function (just a ret)
	* gcc 4.7: removes the loop body, but keeps the loop (!) */
	void words_fromLE32_B(uint32_t *words, size_t n) {
	uint8_t bytes[4];
	size_t i;
	for (i = 0; i < n; ++i) {
	memcpy(bytes, &words[i], 4); /* stage the word's stored bytes */
	words[i] = fromLE32_B(bytes);
	}
	}

	/* Replaces each of the n elements of `words`, in place: the element's bytes
	* are copied into a local 4-byte buffer, which is then decoded by
	* fromBE32_B. With n == 0 the loop body never runs.
	*
	* clang 3: turns this into a bswap loop
	* gcc 4.7: loop containing movs, shifts and ors */
	void words_fromBE32_B(uint32_t *words, size_t n) {
	uint8_t bytes[4];
	size_t i;
	for (i = 0; i < n; ++i) {
	memcpy(bytes, &words[i], 4); /* stage the word's stored bytes */
	words[i] = fromBE32_B(bytes);
	}
	}

	/* ----------------------------------------------------- */
	/* clang 3.0 output */
	#if 0
	.file "byte-order-test.c"
	.text
	.globl fromLE32_A
	.align 16, 0x90
	.type fromLE32_A,@function
	fromLE32_A: # @fromLE32_A
	.Ltmp0:
	.cfi_startproc
	# BB#0:
	movl %edi, %eax
	ret
	.Ltmp1:
	.size fromLE32_A, .Ltmp1-fromLE32_A
	.Ltmp2:
	.cfi_endproc
	.Leh_func_end0:

	.globl fromBE32_A
	.align 16, 0x90
	.type fromBE32_A,@function
	fromBE32_A: # @fromBE32_A
	.Ltmp3:
	.cfi_startproc
	# BB#0:
	bswapl %edi
	movl %edi, %eax
	ret
	.Ltmp4:
	.size fromBE32_A, .Ltmp4-fromBE32_A
	.Ltmp5:
	.cfi_endproc
	.Leh_func_end1:

	.globl words_fromLE32_A
	.align 16, 0x90
	.type words_fromLE32_A,@function
	words_fromLE32_A: # @words_fromLE32_A
	.Ltmp6:
	.cfi_startproc
	# BB#0: # %.lr.ph
	ret
	.Ltmp7:
	.size words_fromLE32_A, .Ltmp7-words_fromLE32_A
	.Ltmp8:
	.cfi_endproc
	.Leh_func_end2:

	.globl words_fromBE32_A
	.align 16, 0x90
	.type words_fromBE32_A,@function
	words_fromBE32_A: # @words_fromBE32_A
	.Ltmp9:
	.cfi_startproc
	# BB#0:
	testq %rsi, %rsi
	je .LBB3_2
	.align 16, 0x90
	.LBB3_1: # %.lr.ph
	# =>This Inner Loop Header: Depth=1
	movl (%rdi), %eax
	bswapl %eax
	movl %eax, (%rdi)
	addq $4, %rdi
	decq %rsi
	jne .LBB3_1
	.LBB3_2: # %._crit_edge
	ret
	.Ltmp10:
	.size words_fromBE32_A, .Ltmp10-words_fromBE32_A
	.Ltmp11:
	.cfi_endproc
	.Leh_func_end3:

	.globl fromLE32_B
	.align 16, 0x90
	.type fromLE32_B,@function
	fromLE32_B: # @fromLE32_B
	.Ltmp12:
	.cfi_startproc
	# BB#0:
	movzbl (%rdi), %ecx
	movzbl 1(%rdi), %eax
	shll $8, %eax
	orl %ecx, %eax
	movzbl 2(%rdi), %ecx
	shll $16, %ecx
	orl %eax, %ecx
	movzbl 3(%rdi), %eax
	shll $24, %eax
	orl %ecx, %eax
	ret
	.Ltmp13:
	.size fromLE32_B, .Ltmp13-fromLE32_B
	.Ltmp14:
	.cfi_endproc
	.Leh_func_end4:

	.globl fromBE32_B
	.align 16, 0x90
	.type fromBE32_B,@function
	fromBE32_B: # @fromBE32_B
	.Ltmp15:
	.cfi_startproc
	# BB#0:
	movzbl (%rdi), %ecx
	shll $24, %ecx
	movzbl 1(%rdi), %eax
	shll $16, %eax
	orl %ecx, %eax
	movzbl 3(%rdi), %ecx
	orl %eax, %ecx
	movzbl 2(%rdi), %eax
	shll $8, %eax
	orl %ecx, %eax
	ret
	.Ltmp16:
	.size fromBE32_B, .Ltmp16-fromBE32_B
	.Ltmp17:
	.cfi_endproc
	.Leh_func_end5:

	.globl words_fromLE32_B
	.align 16, 0x90
	.type words_fromLE32_B,@function
	words_fromLE32_B: # @words_fromLE32_B
	.Ltmp18:
	.cfi_startproc
	# BB#0: # %.lr.ph
	ret
	.Ltmp19:
	.size words_fromLE32_B, .Ltmp19-words_fromLE32_B
	.Ltmp20:
	.cfi_endproc
	.Leh_func_end6:

	.globl words_fromBE32_B
	.align 16, 0x90
	.type words_fromBE32_B,@function
	words_fromBE32_B: # @words_fromBE32_B
	.Ltmp21:
	.cfi_startproc
	# BB#0:
	testq %rsi, %rsi
	je .LBB7_2
	.align 16, 0x90
	.LBB7_1: # %.lr.ph
	# =>This Inner Loop Header: Depth=1
	movl (%rdi), %eax
	bswapl %eax
	movl %eax, (%rdi)
	addq $4, %rdi
	decq %rsi
	jne .LBB7_1
	.LBB7_2: # %._crit_edge
	ret
	.Ltmp22:
	.size words_fromBE32_B, .Ltmp22-words_fromBE32_B
	.Ltmp23:
	.cfi_endproc
	.Leh_func_end7:


	.section ".note.GNU-stack","",@progbits
	#endif

	/* gcc 4.7.0 output */
	#if 0
	.file "byte-order-test.c"
	.text
	.p2align 4,,15
	.globl fromLE32_A
	.type fromLE32_A, @function
	fromLE32_A:
	.LFB12:
	.cfi_startproc
	movl %edi, %eax
	ret
	.cfi_endproc
	.LFE12:
	.size fromLE32_A, .-fromLE32_A
	.p2align 4,,15
	.globl fromBE32_A
	.type fromBE32_A, @function
	fromBE32_A:
	.LFB13:
	.cfi_startproc
	movq %rdi, %rcx
	movl %edi, %eax
	movzbl %ch, %edx
	sall $24, %eax
	sall $16, %edx
	orl %edx, %eax
	movl %edi, %edx
	shrl $8, %edi
	shrl $24, %edx
	andl $65280, %edi
	orl %edx, %eax
	orl %edi, %eax
	ret
	.cfi_endproc
	.LFE13:
	.size fromBE32_A, .-fromBE32_A
	.p2align 4,,15
	.globl words_fromLE32_A
	.type words_fromLE32_A, @function
	words_fromLE32_A:
	.LFB14:
	.cfi_startproc
	testq %rsi, %rsi
	leaq (%rdi,%rsi,4), %rax
	je .L3
	.p2align 4,,10
	.p2align 3
	.L7:
	addq $4, %rdi
	cmpq %rax, %rdi
	jne .L7
	.L3:
	rep
	ret
	.cfi_endproc
	.LFE14:
	.size words_fromLE32_A, .-words_fromLE32_A
	.p2align 4,,15
	.globl words_fromBE32_A
	.type words_fromBE32_A, @function
	words_fromBE32_A:
	.LFB15:
	.cfi_startproc
	testq %rsi, %rsi
	leaq (%rdi,%rsi,4), %r8
	je .L10
	.p2align 4,,10
	.p2align 3
	.L14:
	movl (%rdi), %edx
	movl %edx, %eax
	movzbl %dh, %ecx
	sall $16, %ecx
	sall $24, %eax
	orl %ecx, %eax
	movl %edx, %ecx
	shrl $8, %edx
	shrl $24, %ecx
	andl $65280, %edx
	orl %ecx, %eax
	orl %edx, %eax
	movl %eax, (%rdi)
	addq $4, %rdi
	cmpq %r8, %rdi
	jne .L14
	.L10:
	rep
	ret
	.cfi_endproc
	.LFE15:
	.size words_fromBE32_A, .-words_fromBE32_A
	.p2align 4,,15
	.globl fromLE32_B
	.type fromLE32_B, @function
	fromLE32_B:
	.LFB16:
	.cfi_startproc
	movzbl 1(%rdi), %eax
	movzbl 2(%rdi), %edx
	sall $8, %eax
	sall $16, %edx
	orl %edx, %eax
	movzbl (%rdi), %edx
	orl %edx, %eax
	movzbl 3(%rdi), %edx
	sall $24, %edx
	orl %edx, %eax
	ret
	.cfi_endproc
	.LFE16:
	.size fromLE32_B, .-fromLE32_B
	.p2align 4,,15
	.globl fromBE32_B
	.type fromBE32_B, @function
	fromBE32_B:
	.LFB17:
	.cfi_startproc
	movzbl (%rdi), %eax
	movzbl 1(%rdi), %edx
	sall $24, %eax
	sall $16, %edx
	orl %edx, %eax
	movzbl 3(%rdi), %edx
	orl %edx, %eax
	movzbl 2(%rdi), %edx
	sall $8, %edx
	orl %edx, %eax
	ret
	.cfi_endproc
	.LFE17:
	.size fromBE32_B, .-fromBE32_B
	.p2align 4,,15
	.globl words_fromLE32_B
	.type words_fromLE32_B, @function
	words_fromLE32_B:
	.LFB18:
	.cfi_startproc
	xorl %eax, %eax
	testq %rsi, %rsi
	je .L18
	.p2align 4,,10
	.p2align 3
	.L20:
	addq $1, %rax
	cmpq %rsi, %rax
	jne .L20
	.L18:
	rep
	ret
	.cfi_endproc
	.LFE18:
	.size words_fromLE32_B, .-words_fromLE32_B
	.p2align 4,,15
	.globl words_fromBE32_B
	.type words_fromBE32_B, @function
	words_fromBE32_B:
	.LFB19:
	.cfi_startproc
	xorl %r8d, %r8d
	testq %rsi, %rsi
	je .L23
	.p2align 4,,10
	.p2align 3
	.L25:
	movl (%rdi), %edx
	addq $1, %r8
	movl %edx, %eax
	movzbl %dh, %ecx
	sall $16, %ecx
	sall $24, %eax
	orl %ecx, %eax
	movl %edx, %ecx
	shrl $8, %edx
	shrl $24, %ecx
	andl $65280, %edx
	orl %ecx, %eax
	orl %edx, %eax
	movl %eax, (%rdi)
	addq $4, %rdi
	cmpq %rsi, %r8
	jne .L25
	.L23:
	rep
	ret
	.cfi_endproc
	.LFE19:
	.size words_fromBE32_B, .-words_fromBE32_B
	.ident "GCC: (GNU) 4.7.0 20120324 (prerelease)"
	.section .note.GNU-stack,"",@progbits
	#endif