-
-
Save tonykwok/6c41edf8c5c4e4c9acb4b78c7635cbe5 to your computer and use it in GitHub Desktop.
BGRA to RGBA
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Really awesome code taken from: http://apangborn.com/2011/05/pixel-processing-using-arm-assembly/
/*
 * Swap the first and third byte of every 4-byte pixel while copying from
 * src to dst (RGBA <-> BGRA; the operation is its own inverse).
 *
 * src       - input pixels, 4 bytes per pixel, at least numPixels * 4 bytes.
 * dst       - output pixels, same size as src. dst may be the same buffer as
 *             src (in-place conversion): each pixel is fully read before the
 *             bytes that would overwrite it are stored.
 * numPixels - number of pixels (not bytes) to convert; values <= 0 are a no-op.
 *
 * On 32-bit ARM builds with NEON enabled, blocks of 8 pixels are handled by
 * inline assembly; any remaining pixels (and every pixel on other targets)
 * go through the portable scalar loop at the end.
 */
inline static void neon_rgba_to_bgra(unsigned char *src, unsigned char *dst, int numPixels)
{
    int col = 0;

    // A negative count would otherwise produce a negative, non-zero SIMD
    // iteration count and send the assembly loop running through memory.
    if (numPixels <= 0) {
        return;
    }

    // The assembly below is written in 32-bit ARM (A32/T32) NEON syntax, so it
    // is skipped on AArch64 even when the toolchain defines __ARM_NEON__ there.
#if defined(__ARM_NEON__) && !defined(__aarch64__)
    int simd_pixels = numPixels & ~7;       // round down to a multiple of 8
    int simd_iterations = simd_pixels >> 3; // 8 pixels per iteration

    if (simd_iterations) { // the loop below is do-while shaped: needs >= 1 iteration
        __asm__ __volatile__ ("1: \n\t"
            // structured load of 8 pixels: byte 0 of each pixel -> d0, byte 1 -> d1, ...
            "vld4.8 {d0, d1, d2, d3}, [%[source]]! \n\t" // "!" post-increments the pointer by the bytes read
            "vswp d0, d2 \n\t"                           // swap the first and third channel of all 8 pixels
            "vst4.8 {d0, d1, d2, d3}, [%[dest]]! \n\t"   // structured store, "!" post-increments the pointer
            "subs %[iterations],%[iterations],#1 \n\t"   // decrement the counter and set the flags
            "bne 1b"                                     // loop back to local label 1 while counter != 0
            // All operands are read-write ("+") so the post-incremented
            // pointers are visible to the scalar tail loop below.
            : [source]"+r"(src), [dest]"+r"(dst), [iterations]"+r"(simd_iterations)
            : // no pure inputs
            // "cc" is required because `subs` modifies the condition flags.
            : "memory", "cc", "d0", "d1", "d2", "d3"
        );
    }
    col = simd_pixels;
#endif

    // Portable path: converts the pixels not handled above (all of them when
    // the NEON block is compiled out).
    for (; col < numPixels; col++, src += 4, dst += 4) {
        unsigned char r = src[0]; // saved first so in-place conversion works
        dst[0] = src[2];
        dst[1] = src[1];
        dst[2] = r;
        dst[3] = src[3];
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment