Last active
September 16, 2023 12:43
-
-
Save aronson/07e8aa5b522ac43bf2becb894519941e to your computer and use it in GitHub Desktop.
Thoughts on accurate ARM NEON SDL alpha blending blitters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <arm_neon.h> | |
#include <SDL.h> | |
/** | |
* NEON operates at up to 128 bit width and supports mostly the same operations BlitNtoNPixelAlpha_SSE4_1 and _AVX2 use | |
* save for shuffling. | |
* We have several problems to solve, some easier than others. We need to reorder the input src pixel stream efficiently. | |
* At 128 bits this is 4-wide, similar to SSE4.1. We can assemble lookup tables that NEON will use to place the data | |
* appropriately into a new vector, instead of shuffling in-place. | |
* We need a similar table-based approach to extract alpha dynamically for the splat: as far as I can tell, the lane
* extract intrinsics require immediate (compile-time constant) lane indices, and switching on the
* SDL_PixelFormat->Ashift struct member to pick one would be a performance nightmare.
* From there MixRGBA_NEON is nearly the same as the SSE4.1 intrinsic implementation. Missing from this code is the | |
* required step where alpha is set to 0xFF in all src pixel input and only preserved in the input mask for channel multiply. | |
* | |
* Benchmarks indicate a jump from ~20 FPS in the optimized scalar form to over 90 FPS on an M1 Max under the latest macOS | |
* and Apple Clang using `-O3`. | |
*/ | |
// vtable fetches alpha from input pixels based on format for splat | |
uint8x8_t get_alpha_vtable(SDL_PixelFormat* dstfmt) { | |
// Create a table lookup pattern to extract the alpha values | |
uint8_t pattern_data[8] = {0, 0, 0, 0, 0, 0, 0, 0}; | |
pattern_data[dstfmt->Ashift / 8] = 1; | |
return vld1_u8(pattern_data); | |
} | |
/**
 * Apply one 8-byte table lookup to both 64-bit halves of a 128-bit vector.
 *
 * Each half of input_colors is used as the lookup table and pattern supplies
 * the byte indices; the two 8-byte results are joined back in low/high order.
 *
 * @param input_colors 16 bytes of pixel data to read from
 * @param pattern      per-byte indices into each 8-byte half
 * @return the looked-up bytes for the low half followed by the high half
 */
uint8x16_t extract_alpha_values(uint8x16_t input_colors, uint8x8_t pattern) {
    return vcombine_u8(vtbl1_u8(vget_low_u8(input_colors), pattern),
                       vtbl1_u8(vget_high_u8(input_colors), pattern));
}
/**
 * Produce the alpha operand for MixRGBA_NEON.
 *
 * This is a thin wrapper over extract_alpha_values: no rearrangement is done
 * beyond that lookup, so the returned bytes are exactly what alpha_mask
 * selects. Whether alpha ends up replicated across every byte of a pixel is
 * therefore decided entirely by the pattern passed in.
 *
 * @param reordered_colors 16 bytes of pixel data to read alpha from
 * @param alpha_mask       table-lookup index pattern applied to each 8-byte half
 * @return the vector returned by the lookup
 */
uint8x16_t splat_alpha_to_new_vector(uint8x16_t reordered_colors, uint8x8_t alpha_mask) {
    return extract_alpha_values(reordered_colors, alpha_mask);
}
/**
 * Split a 128-bit byte vector into a pair of 64-bit vectors.
 *
 * @param input the 16-byte vector to split
 * @return a pair whose val[0] is the low half and val[1] is the high half
 */
uint8x8x2_t convert_uint8x16_to_uint8x8x2(uint8x16_t input) {
    const uint8x8_t lower = vget_low_u8(input);
    const uint8x8_t upper = vget_high_u8(input);
    uint8x8x2_t halves;
    halves.val[1] = upper;
    halves.val[0] = lower;
    return halves;
}
// Produce a vtable defining the color reorder operation | |
uint8x8_t generate_reorder_vtable(const SDL_PixelFormat *srcfmt, const SDL_PixelFormat *dstfmt) { | |
uint8_t shuffle_mask[8]; | |
for (int i = 0; i < 2; ++i) { | |
shuffle_mask[dstfmt->Ashift / 8 + i * 4] = srcfmt->Ashift / 8 + i * 4; | |
shuffle_mask[dstfmt->Rshift / 8 + i * 4] = srcfmt->Rshift / 8 + i * 4; | |
shuffle_mask[dstfmt->Gshift / 8 + i * 4] = srcfmt->Gshift / 8 + i * 4; | |
shuffle_mask[dstfmt->Bshift / 8 + i * 4] = srcfmt->Bshift / 8 + i * 4; | |
} | |
return vld1_u8(shuffle_mask); | |
} | |
/**
 * Permute the bytes of four 32-bit pixels according to a table-lookup pattern.
 *
 * The 128-bit input is handled as two 64-bit halves of two pixels each; the
 * same 8-byte pattern is applied to both halves and the results are rejoined
 * in low/high order.
 *
 * @param src_pixels four source pixels (16 bytes)
 * @param pattern    per-byte indices into each 8-byte half
 * @return the permuted pixels
 */
uint8x16_t reorder_pixels_argb8888_to_dstfmt(const uint8x16_t src_pixels, const uint8x8_t pattern) {
    const uint8x8_t first_pair = vtbl1_u8(vget_low_u8(src_pixels), pattern);
    const uint8x8_t second_pair = vtbl1_u8(vget_high_u8(src_pixels), pattern);
    return vcombine_u8(first_pair, second_pair);
}
// Blend four pixels of 32 ARGB data -- missing alpha saturate step | |
uint8x16_t MixRGBA_NEON(uint8x16_t sC, uint8x16_t dC, uint8x16_t sA) { | |
// Calculate (sC - dC) * sA | |
uint16x8_t diff_lo = vmull_u8(vget_low_u8(sC), vget_low_u8(sA)); | |
uint16x8_t diff_hi = vmull_u8(vget_high_u8(sC), vget_high_u8(sA)); | |
diff_lo = vsubq_u16(diff_lo, vmull_u8(vget_low_u8(dC), vget_low_u8(sA))); | |
diff_hi = vsubq_u16(diff_hi, vmull_u8(vget_high_u8(dC), vget_high_u8(sA))); | |
// Calculate (dC << 8) - dC | |
uint16x8_t dC_lo = vmovl_u8(vget_low_u8(dC)); | |
uint16x8_t dC_hi = vmovl_u8(vget_high_u8(dC)); | |
dC_lo = vsubq_u16(vshlq_n_u16(dC_lo, 8), dC_lo); | |
dC_hi = vsubq_u16(vshlq_n_u16(dC_hi, 8), dC_hi); | |
// Add the two results and the constant 0x1 | |
uint16x8_t x_lo = vaddq_u16(vaddq_u16(diff_lo, dC_lo), vdupq_n_u16(0x1)); | |
uint16x8_t x_hi = vaddq_u16(vaddq_u16(diff_hi, dC_hi), vdupq_n_u16(0x1)); | |
// Add x >> 8 to x and then shift the result right by 8 bits | |
x_lo = vaddq_u16(x_lo, vshrq_n_u16(x_lo, 8)); | |
x_hi = vaddq_u16(x_hi, vshrq_n_u16(x_hi, 8)); | |
x_lo = vshrq_n_u16(x_lo, 8); | |
x_hi = vshrq_n_u16(x_hi, 8); | |
// Combine the results and return | |
return vcombine_u8(vmovn_u16(x_lo), vmovn_u16(x_hi)); | |
} | |
// Snippet demonstrating plugging this in to BlitNtoNPixelAlpha, very buggy impl at this time.
// NOTE(review): srcfmt, dstfmt, src, dst, width, height, srcbpp, dstbpp, srcskip, dstskip and the
// scalar blend locals (Pixel, sR/sG/sB/sA, dR/dG/dB/dA) are expected to come from the enclosing
// function, which is not part of this snippet.
#ifdef SDL_NEON_INTRINSICS
    if (srcbpp == 4 && dstbpp == 4 && width >= 4 && SDL_HasNEON()) {
        // Lookup patterns depend only on the formats, so build them once per blit.
        uint8x8_t shuffle_mask = generate_reorder_vtable(srcfmt, dstfmt);
        uint8x8_t alpha_mask = get_alpha_vtable(dstfmt);
        int chunks = width / 4;
        while (height--) {
            // Vector path: four 32-bit pixels (16 bytes) per iteration.
            for (int i = 0; i < chunks; i += 1) {
                uint8x16_t colors = vld1q_u8(src + i * 16);
                colors = reorder_pixels_argb8888_to_dstfmt(colors, shuffle_mask);
                uint8x16_t dst_colors = vld1q_u8(dst + i * 16);
                // Alpha is read from the already reordered pixels, hence the dstfmt-based pattern.
                uint8x16_t alpha_splat = splat_alpha_to_new_vector(colors, alpha_mask);
                uint8x16_t mixed_colors = MixRGBA_NEON(colors, dst_colors, alpha_splat);
                vst1q_u8(dst + i * 16, mixed_colors);
            }
            // Scalar path for the pixels left over when width is not a multiple of 4;
            // the loop body never runs when there is no remainder.
            for (int i = chunks * 4; i < width; i++) {
                DISEMBLE_RGBA(src + i * 4, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
                if (sA) {
                    DISEMBLE_RGBA(dst + i * 4, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
                    ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
                    ASSEMBLE_RGBA(dst + i * 4, dstbpp, dstfmt, dR, dG, dB, dA);
                }
            }
            // Advance past this row's pixels, then past the row padding.
            src += 4 * width;
            dst += 4 * width;
            src += srcskip;
            dst += dstskip;
        }
        return;
    }
#endif
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment