aronson · September 16, 2023 12:43
diff --git a/blend.c b/blend.c
 #include <arm_neon.h>
 #include <SDL.h>

 /**
  * NEON operates at up to 128 bit width and supports mostly the same operations BlitNtoNPixelAlpha_SSE4_1 and _AVX2 use
  * save for shuffling.
  * We have several problems to solve, some easier than others. We need to reorder the input src pixel stream efficiently.
  * At 128 bits this is 4-wide, similar to SSE4.1. We can assemble lookup tables that NEON will use to place the data 
  * appropriately into a new vector, instead of shuffling in-place.
  * We need a similar table-driven approach to extract alpha dynamically before splatting it: the lane-extract
  * intrinsics want immediate (compile-time constant) lane indices AFAICT, and switching on the
  * SDL_PixelFormat->Ashift struct member to pick one would be a performance nightmare.
  * From there MixRGBA_NEON is nearly the same as the SSE4.1 intrinsic implementation. Missing from this code is the
  * required step where alpha is set to 0xFF in all src pixel input and only preserved in the input mask for channel multiply.
  *
  * Benchmarks indicate a jump from ~20 FPS in the optimized scalar form to over 90 FPS on an M1 Max under the latest macOS 
  * and Apple Clang using `-O3`.
  */

 /**
  * Build the 8-entry byte lookup table that broadcasts each pixel's alpha byte
  * across all four bytes of that pixel when applied with vtbl1_u8.
  *
  * vtbl1_u8 computes out[i] = in[table[i]], and a 64-bit half-register holds
  * two 32-bit pixels. Entries 0..3 therefore all name the alpha byte of the
  * first pixel and entries 4..7 the alpha byte of the second. A table that is
  * zero everywhere except for a single 1 would select bytes 0 and 1 of the
  * first pixel instead of any alpha value.
  *
  * @param dstfmt Format whose Ashift locates the alpha channel. The byte index
  *               is taken as Ashift / 8 (NOTE(review): assumes byte-aligned
  *               channels addressed in little-endian order -- confirm).
  * @return Table {a, a, a, a, a + 4, a + 4, a + 4, a + 4} with a = Ashift / 8.
  */
 uint8x8_t get_alpha_vtable(SDL_PixelFormat* dstfmt) {
    // Clamp to the four bytes of one pixel so an unexpected shift cannot index outside it
    const uint8_t alpha_byte = (uint8_t)((dstfmt->Ashift / 8) & 3);
    uint8_t pattern_data[8];

    for (int i = 0; i < 8; ++i) {
        // (i & 4) is 0 for the first pixel's bytes and 4 for the second pixel's
        pattern_data[i] = (uint8_t)(alpha_byte + (i & 4));
    }
    return vld1_u8(pattern_data);
 }

 // Runs each 64-bit half of input_colors through the byte lookup table `pattern`
 // (out[i] = half[pattern[i]]) and joins the two looked-up halves back into one vector.
 uint8x16_t extract_alpha_values(uint8x16_t input_colors, uint8x8_t pattern) {
    return vcombine_u8(vtbl1_u8(vget_low_u8(input_colors), pattern),
                       vtbl1_u8(vget_high_u8(input_colors), pattern));
 }

 // Produces the alpha vector passed to MixRGBA_NEON.
 // All of the work is the table lookup done by extract_alpha_values; no further byte
 // duplication happens here, so the result is a per-pixel alpha splat only when
 // alpha_mask maps every byte of a pixel to that pixel's alpha byte.
 uint8x16_t splat_alpha_to_new_vector(uint8x16_t reordered_colors, uint8x8_t alpha_mask) {
    // Splitting the looked-up vector into halves and recombining them is an identity,
    // so the lookup result is returned as-is.
    return extract_alpha_values(reordered_colors, alpha_mask);
 }

 // Helper: split a 128-bit vector into its low (val[0]) and high (val[1]) 64-bit halves
 uint8x8x2_t convert_uint8x16_to_uint8x8x2(uint8x16_t input) {
    const uint8x8x2_t halves = {{vget_low_u8(input), vget_high_u8(input)}};
    return halves;
 }

 /**
  * Build the byte lookup table that reorders two src-format pixels into
  * dst-format channel order.
  *
  * vtbl1_u8 computes out[i] = in[table[i]], so the entry at each dst channel's
  * byte position holds the byte position of the same channel in src. The
  * per-pixel mapping is written twice (byte offsets 0 and 4), once for each of
  * the two 32-bit pixels held in a 64-bit half-register.
  *
  * The table starts as the identity mapping: if two channels of dstfmt share a
  * shift (presumably the case for an alpha-less format -- confirm), one slot
  * per pixel is never assigned and would otherwise be loaded uninitialized.
  *
  * @param srcfmt Source format; only the A/R/G/B shift members are read.
  * @param dstfmt Destination format; only the A/R/G/B shift members are read.
  * @return 8-entry table to use with vtbl1_u8 on each half of a 4-pixel vector.
  */
 uint8x8_t generate_reorder_vtable(const SDL_PixelFormat *srcfmt, const SDL_PixelFormat *dstfmt) {
    uint8_t shuffle_mask[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    for (int i = 0; i < 2; ++i) {
        const int base = i * 4;
        // `& 3` keeps every index inside the current pixel even for an unexpected shift value
        shuffle_mask[base + ((dstfmt->Ashift / 8) & 3)] = (uint8_t)(base + ((srcfmt->Ashift / 8) & 3));
        shuffle_mask[base + ((dstfmt->Rshift / 8) & 3)] = (uint8_t)(base + ((srcfmt->Rshift / 8) & 3));
        shuffle_mask[base + ((dstfmt->Gshift / 8) & 3)] = (uint8_t)(base + ((srcfmt->Gshift / 8) & 3));
        shuffle_mask[base + ((dstfmt->Bshift / 8) & 3)] = (uint8_t)(base + ((srcfmt->Bshift / 8) & 3));
    }
    return vld1_u8(shuffle_mask);
 }

 // Permute the bytes of a 16-byte pixel vector through `pattern`, one 64-bit half at a time,
 // so that each half comes out as out[i] = half[pattern[i]].
 uint8x16_t reorder_pixels_argb8888_to_dstfmt(const uint8x16_t src_pixels, const uint8x8_t pattern) {
    // The same 8-entry table is applied to both halves
    const uint8x8_t front = vtbl1_u8(vget_low_u8(src_pixels), pattern);
    const uint8x8_t back = vtbl1_u8(vget_high_u8(src_pixels), pattern);

    return vcombine_u8(front, back);
 }

 // One 64-bit half of MixRGBA_NEON: blends eight bytes of sC over dC, weighted per byte by sA.
 static inline uint8x8_t mix_rgba_half_neon(uint8x8_t sC, uint8x8_t dC, uint8x8_t sA) {
    const uint16x8_t dC_wide = vmovl_u8(dC);

    // sC * sA - dC * sA, i.e. (sC - dC) * sA in wrapping 16-bit arithmetic
    const uint16x8_t diff = vsubq_u16(vmull_u8(sC, sA), vmull_u8(dC, sA));
    // (dC << 8) - dC
    const uint16x8_t dC_scaled = vsubq_u16(vshlq_n_u16(dC_wide, 8), dC_wide);
    // Sum of the two terms plus the constant 0x1
    const uint16x8_t x = vaddq_u16(vaddq_u16(diff, dC_scaled), vdupq_n_u16(0x1));

    // (x + (x >> 8)) >> 8, then narrow each 16-bit lane back to a byte
    return vmovn_u16(vshrq_n_u16(vaddq_u16(x, vshrq_n_u16(x, 8)), 8));
 }

 // Blend sixteen bytes (four 32-bit pixels) of sC over dC using the per-byte weights in sA.
 // Every byte is treated the same way, so the alpha byte of sC is blended like a color
 // channel -- the alpha saturate step is still missing here.
 uint8x16_t MixRGBA_NEON(uint8x16_t sC, uint8x16_t dC, uint8x16_t sA) {
    return vcombine_u8(mix_rgba_half_neon(vget_low_u8(sC), vget_low_u8(dC), vget_low_u8(sA)),
                       mix_rgba_half_neon(vget_high_u8(sC), vget_high_u8(dC), vget_high_u8(sA)));
 }


 // Snippet demonstrating plugging this in to BlitNtoNPixelAlpha, very buggy impl at this time.
 // It is a fragment of a function body: src, dst, width, height, srcskip, dstskip, srcfmt,
 // dstfmt, srcbpp, dstbpp and the Pixel / sR..sA / dR..dA scratch variables come from the
 // enclosing function.
 #ifdef SDL_NEON_INTRINSICS
    // Alpha is read at dstfmt's alpha position after the reorder, so only take this path when
    // dstfmt actually has an alpha mask; other formats fall through to the code that follows.
    if (srcbpp == 4 && dstbpp == 4 && width >= 4 && dstfmt->Amask != 0 && SDL_HasNEON()) {
        const uint8x8_t shuffle_mask = generate_reorder_vtable(srcfmt, dstfmt);
        const uint8x8_t alpha_mask = get_alpha_vtable(dstfmt);
        // Amask replicated into every pixel: all alpha bits set, everything else clear
        // (NOTE(review): assumes an 8-bit alpha channel, i.e. Amask == 0xFF << Ashift -- confirm)
        const uint8x16_t opaque_alpha = vreinterpretq_u8_u32(vdupq_n_u32(dstfmt->Amask));
        const int chunks = width / 4;
        const int vector_pixels = chunks * 4;

        while (height--) {
            // Four pixels (16 bytes) per iteration
            for (int i = 0; i < chunks; i += 1) {
                uint8x16_t colors = vld1q_u8(src + i * 16);
                colors = reorder_pixels_argb8888_to_dstfmt(colors, shuffle_mask);
                const uint8x16_t dst_colors = vld1q_u8(dst + i * 16);
                // Capture the real source alpha before it is overwritten below
                const uint8x16_t alpha_splat = splat_alpha_to_new_vector(colors, alpha_mask);
                // Force the source alpha byte to 0xFF so the alpha channel blends as
                // sA + dA * (255 - sA) / 255 instead of being scaled by itself
                colors = vorrq_u8(colors, opaque_alpha);
                const uint8x16_t mixed_colors = MixRGBA_NEON(colors, dst_colors, alpha_splat);
                vst1q_u8(dst + i * 16, mixed_colors);
            }

            // Handle remaining pixels when width is not a multiple of 4
            for (int i = vector_pixels; i < width; i++) {
                DISEMBLE_RGBA(src + i * 4, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
                if (sA) {
                    DISEMBLE_RGBA(dst + i * 4, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
                    ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
                    ASSEMBLE_RGBA(dst + i * 4, dstbpp, dstfmt, dR, dG, dB, dA);
                }
            }

            // Step over this row's pixels, then over the padding up to the next row
            src += 4 * width;
            dst += 4 * width;

            src += srcskip;
            dst += dstskip;
        }
        return;
    }
 #endif
	#include <arm_neon.h>
	#include <SDL.h>

	/**
	* NEON operates at up to 128 bit width and supports mostly the same operations BlitNtoNPixelAlpha_SSE4_1 and _AVX2 use
	* save for shuffling.
	* We have several problems to solve, some easier than others. We need to reorder the input src pixel stream efficiently.
	* At 128 bits this is 4-wide, similar to SSE4.1. We can assemble lookup tables that NEON will use to place the data
	* appropriately into a new vector, instead of shuffling in-place.
	* We need a similar table-driven approach to extract alpha dynamically before splatting it: the lane-extract
	* intrinsics want immediate (compile-time constant) lane indices AFAICT, and switching on the
	* SDL_PixelFormat->Ashift struct member to pick one would be a performance nightmare.
	* From there MixRGBA_NEON is nearly the same as the SSE4.1 intrinsic implementation. Missing from this code is the
	* required step where alpha is set to 0xFF in all src pixel input and only preserved in the input mask for channel multiply.
	*
	* Benchmarks indicate a jump from ~20 FPS in the optimized scalar form to over 90 FPS on an M1 Max under the latest macOS
	* and Apple Clang using `-O3`.
	*/

	/**
	 * Build the 8-entry byte lookup table that broadcasts each pixel's alpha byte
	 * across all four bytes of that pixel when applied with vtbl1_u8.
	 *
	 * vtbl1_u8 computes out[i] = in[table[i]], and a 64-bit half-register holds
	 * two 32-bit pixels. Entries 0..3 therefore all name the alpha byte of the
	 * first pixel and entries 4..7 the alpha byte of the second. A table that is
	 * zero everywhere except for a single 1 would select bytes 0 and 1 of the
	 * first pixel instead of any alpha value.
	 *
	 * @param dstfmt Format whose Ashift locates the alpha channel. The byte index
	 *               is taken as Ashift / 8 (NOTE(review): assumes byte-aligned
	 *               channels addressed in little-endian order -- confirm).
	 * @return Table {a, a, a, a, a + 4, a + 4, a + 4, a + 4} with a = Ashift / 8.
	 */
	uint8x8_t get_alpha_vtable(SDL_PixelFormat* dstfmt) {
	    // Clamp to the four bytes of one pixel so an unexpected shift cannot index outside it
	    const uint8_t alpha_byte = (uint8_t)((dstfmt->Ashift / 8) & 3);
	    uint8_t pattern_data[8];

	    for (int i = 0; i < 8; ++i) {
	        // (i & 4) is 0 for the first pixel's bytes and 4 for the second pixel's
	        pattern_data[i] = (uint8_t)(alpha_byte + (i & 4));
	    }
	    return vld1_u8(pattern_data);
	}

	// Looks up both 64-bit halves of input_colors in the byte table `pattern`
	// (out[i] = half[pattern[i]]) and returns the two results as one 128-bit vector.
	uint8x16_t extract_alpha_values(uint8x16_t input_colors, uint8x8_t pattern) {
	    const uint8x8_t picked_lo = vtbl1_u8(vget_low_u8(input_colors), pattern);
	    const uint8x8_t picked_hi = vtbl1_u8(vget_high_u8(input_colors), pattern);

	    return vcombine_u8(picked_lo, picked_hi);
	}

	// Produces the alpha vector passed to MixRGBA_NEON.
	// All of the work is the table lookup done by extract_alpha_values; no further byte
	// duplication happens here, so the result is a per-pixel alpha splat only when
	// alpha_mask maps every byte of a pixel to that pixel's alpha byte.
	uint8x16_t splat_alpha_to_new_vector(uint8x16_t reordered_colors, uint8x8_t alpha_mask) {
	    // Splitting the looked-up vector into halves and recombining them is an identity,
	    // so the lookup result is returned as-is.
	    return extract_alpha_values(reordered_colors, alpha_mask);
	}

	// Helper: split a 128-bit vector into its low (val[0]) and high (val[1]) 64-bit halves
	uint8x8x2_t convert_uint8x16_to_uint8x8x2(uint8x16_t input) {
	    const uint8x8x2_t halves = {{vget_low_u8(input), vget_high_u8(input)}};
	    return halves;
	}

	/**
	 * Build the byte lookup table that reorders two src-format pixels into
	 * dst-format channel order.
	 *
	 * Both formats are taken by pointer: the body dereferences them with `->`,
	 * which does not compile against by-value struct parameters.
	 *
	 * vtbl1_u8 computes out[i] = in[table[i]], so the entry at each dst channel's
	 * byte position holds the byte position of the same channel in src. The
	 * per-pixel mapping is written twice (byte offsets 0 and 4), once for each of
	 * the two 32-bit pixels held in a 64-bit half-register.
	 *
	 * The table starts as the identity mapping: if two channels of dstfmt share a
	 * shift (presumably the case for an alpha-less format -- confirm), one slot
	 * per pixel is never assigned and would otherwise be loaded uninitialized.
	 *
	 * @param srcfmt Source format; only the A/R/G/B shift members are read.
	 * @param dstfmt Destination format; only the A/R/G/B shift members are read.
	 * @return 8-entry table to use with vtbl1_u8 on each half of a 4-pixel vector.
	 */
	uint8x8_t generate_reorder_vtable(const SDL_PixelFormat *srcfmt, const SDL_PixelFormat *dstfmt) {
	    uint8_t shuffle_mask[8] = {0, 1, 2, 3, 4, 5, 6, 7};
	    for (int i = 0; i < 2; ++i) {
	        const int base = i * 4;
	        // `& 3` keeps every index inside the current pixel even for an unexpected shift value
	        shuffle_mask[base + ((dstfmt->Ashift / 8) & 3)] = (uint8_t)(base + ((srcfmt->Ashift / 8) & 3));
	        shuffle_mask[base + ((dstfmt->Rshift / 8) & 3)] = (uint8_t)(base + ((srcfmt->Rshift / 8) & 3));
	        shuffle_mask[base + ((dstfmt->Gshift / 8) & 3)] = (uint8_t)(base + ((srcfmt->Gshift / 8) & 3));
	        shuffle_mask[base + ((dstfmt->Bshift / 8) & 3)] = (uint8_t)(base + ((srcfmt->Bshift / 8) & 3));
	    }
	    return vld1_u8(shuffle_mask);
	}

	// Permute the bytes of a 16-byte pixel vector through `pattern`, one 64-bit half at a time,
	// so that each half comes out as out[i] = half[pattern[i]].
	uint8x16_t reorder_pixels_argb8888_to_dstfmt(const uint8x16_t src_pixels, const uint8x8_t pattern) {
	    // The same 8-entry table is applied to both halves
	    const uint8x8_t front = vtbl1_u8(vget_low_u8(src_pixels), pattern);
	    const uint8x8_t back = vtbl1_u8(vget_high_u8(src_pixels), pattern);

	    return vcombine_u8(front, back);
	}

	// Blend sixteen bytes (four 32-bit pixels) of sC over dC using the per-byte weights in sA.
	// Every byte is treated the same way, so the alpha byte of sC is blended like a color
	// channel -- the alpha saturate step is still missing here.
	uint8x16_t MixRGBA_NEON(uint8x16_t sC, uint8x16_t dC, uint8x16_t sA) {
	    const uint16x8_t one = vdupq_n_u16(0x1);
	    const uint8x8_t s_lo = vget_low_u8(sC), s_hi = vget_high_u8(sC);
	    const uint8x8_t d_lo = vget_low_u8(dC), d_hi = vget_high_u8(dC);
	    const uint8x8_t a_lo = vget_low_u8(sA), a_hi = vget_high_u8(sA);
	    const uint16x8_t d_lo16 = vmovl_u8(d_lo);
	    const uint16x8_t d_hi16 = vmovl_u8(d_hi);

	    // sC * sA - dC * sA, i.e. (sC - dC) * sA in wrapping 16-bit arithmetic
	    uint16x8_t acc_lo = vsubq_u16(vmull_u8(s_lo, a_lo), vmull_u8(d_lo, a_lo));
	    uint16x8_t acc_hi = vsubq_u16(vmull_u8(s_hi, a_hi), vmull_u8(d_hi, a_hi));

	    // ... + ((dC << 8) - dC)
	    acc_lo = vaddq_u16(acc_lo, vsubq_u16(vshlq_n_u16(d_lo16, 8), d_lo16));
	    acc_hi = vaddq_u16(acc_hi, vsubq_u16(vshlq_n_u16(d_hi16, 8), d_hi16));

	    // ... + 0x1
	    acc_lo = vaddq_u16(acc_lo, one);
	    acc_hi = vaddq_u16(acc_hi, one);

	    // (x + (x >> 8)) >> 8
	    acc_lo = vshrq_n_u16(vaddq_u16(acc_lo, vshrq_n_u16(acc_lo, 8)), 8);
	    acc_hi = vshrq_n_u16(vaddq_u16(acc_hi, vshrq_n_u16(acc_hi, 8)), 8);

	    // Narrow each 16-bit lane back to a byte and rejoin the halves
	    return vcombine_u8(vmovn_u16(acc_lo), vmovn_u16(acc_hi));
	}


	// Snippet demonstrating plugging this in to BlitNtoNPixelAlpha, very buggy impl at this time.
	// It is a fragment of a function body: src, dst, width, height, srcskip, dstskip, srcfmt,
	// dstfmt, srcbpp, dstbpp and the Pixel / sR..sA / dR..dA scratch variables come from the
	// enclosing function.
	#ifdef SDL_NEON_INTRINSICS
	// Alpha is read at dstfmt's alpha position after the reorder, so only take this path when
	// dstfmt actually has an alpha mask; other formats fall through to the code that follows.
	if (srcbpp == 4 && dstbpp == 4 && width >= 4 && dstfmt->Amask != 0 && SDL_HasNEON()) {
	    const uint8x8_t shuffle_mask = generate_reorder_vtable(srcfmt, dstfmt);
	    const uint8x8_t alpha_mask = get_alpha_vtable(dstfmt);
	    // Amask replicated into every pixel: all alpha bits set, everything else clear
	    // (NOTE(review): assumes an 8-bit alpha channel, i.e. Amask == 0xFF << Ashift -- confirm)
	    const uint8x16_t opaque_alpha = vreinterpretq_u8_u32(vdupq_n_u32(dstfmt->Amask));
	    const int chunks = width / 4;
	    const int vector_pixels = chunks * 4;

	    while (height--) {
	        // Four pixels (16 bytes) per iteration
	        for (int i = 0; i < chunks; i += 1) {
	            uint8x16_t colors = vld1q_u8(src + i * 16);
	            colors = reorder_pixels_argb8888_to_dstfmt(colors, shuffle_mask);
	            const uint8x16_t dst_colors = vld1q_u8(dst + i * 16);
	            // Capture the real source alpha before it is overwritten below
	            const uint8x16_t alpha_splat = splat_alpha_to_new_vector(colors, alpha_mask);
	            // Force the source alpha byte to 0xFF so the alpha channel blends as
	            // sA + dA * (255 - sA) / 255 instead of being scaled by itself
	            colors = vorrq_u8(colors, opaque_alpha);
	            const uint8x16_t mixed_colors = MixRGBA_NEON(colors, dst_colors, alpha_splat);
	            vst1q_u8(dst + i * 16, mixed_colors);
	        }

	        // Handle remaining pixels when width is not a multiple of 4
	        for (int i = vector_pixels; i < width; i++) {
	            DISEMBLE_RGBA(src + i * 4, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
	            if (sA) {
	                DISEMBLE_RGBA(dst + i * 4, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
	                ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
	                ASSEMBLE_RGBA(dst + i * 4, dstbpp, dstfmt, dR, dG, dB, dA);
	            }
	        }

	        // Step over this row's pixels, then over the padding up to the next row
	        src += 4 * width;
	        dst += 4 * width;

	        src += srcskip;
	        dst += dstskip;
	    }
	    return;
	}
	#endif