Skip to content

Instantly share code, notes, and snippets.

@aconz2
Created May 1, 2026 16:58
Show Gist options
  • Select an option

  • Save aconz2/5a1bd36ce34afeeb5479f6ce775bb3d7 to your computer and use it in GitHub Desktop.

Select an option

Save aconz2/5a1bd36ce34afeeb5479f6ce775bb3d7 to your computer and use it in GitHub Desktop.
#include <cstdint>
#include <emmintrin.h>
struct array_container_t {
const uint16_t *array;
size_t cardinality;
};
__attribute__((always_inline)) inline bool simd_quad_me2(const array_container_t *arr, uint16_t pos) {
constexpr int32_t gap = 64;
const uint16_t *carr = arr->array;
int32_t cardinality = arr->cardinality;
if (cardinality < 8) {
for (int32_t j = 0; j < cardinality; j++) {
if (carr[j] == pos) return true;
}
return false;
}
if (cardinality < gap) {
__m128i needle = _mm_set1_epi16((short)pos);
int32_t j = 0;
for (; j + 16 <= cardinality; j += 16) {
__m128i v0 = _mm_loadu_si128((const __m128i *)(carr + j));
__m128i v1 = _mm_loadu_si128((const __m128i *)(carr + j + 8));
if (_mm_movemask_epi8(_mm_or_si128(_mm_cmpeq_epi16(v0, needle), _mm_cmpeq_epi16(v1, needle))) != 0) return true;
}
for (; j + 8 <= cardinality; j += 8) {
__m128i v0 = _mm_loadu_si128((const __m128i *)(carr + j));
if (_mm_movemask_epi8(_mm_cmpeq_epi16(v0, needle)) != 0) return true;
}
for (; j < cardinality; j++) {
if (carr[j] == pos) return true;
}
return false;
}
int32_t num_blocks = cardinality / gap;
int32_t base = 0;
int32_t n = num_blocks;
while (n > 3) {
int32_t quarter = n >> 2;
int32_t k1 = carr[(base + quarter + 1) * gap - 1];
int32_t k2 = carr[(base + 2 * quarter + 1) * gap - 1];
int32_t k3 = carr[(base + 3 * quarter + 1) * gap - 1];
int32_t c1 = (k1 < pos);
int32_t c2 = (k2 < pos);
int32_t c3 = (k3 < pos);
base += (c1 + c2 + c3) * quarter;
n -= 3 * quarter;
}
while (n > 1) {
int32_t half = n >> 1;
base = (carr[(base + half + 1) * gap - 1] < pos) ? base + half : base;
n -= half;
}
int32_t lo = (carr[(base + 1) * gap - 1] < pos) ? base + 1 : base;
if (lo < num_blocks) {
const uint16_t *blk = carr + lo * gap;
#ifdef __ARM_NEON
// TODO
uint16x8_t needle = vdupq_n_u16(pos);
uint16x8_t v0 = vld1q_u16(blk);
uint16x8_t v1 = vld1q_u16(blk + 8);
uint16x8_t hit = vorrq_u16(vceqq_u16(v0, needle), vceqq_u16(v1, needle));
return vmaxvq_u16(hit) != 0;
#else
__m128i needle = _mm_set1_epi16((short)pos);
__m128i v0 = _mm_loadu_si128((const __m128i *)blk);
__m128i v1 = _mm_loadu_si128((const __m128i *)(blk + 8));
__m128i v2 = _mm_loadu_si128((const __m128i *)(blk + 16));
__m128i v3 = _mm_loadu_si128((const __m128i *)(blk + 24));
__m128i v4 = _mm_loadu_si128((const __m128i *)(blk + 32));
__m128i v5 = _mm_loadu_si128((const __m128i *)(blk + 40));
__m128i v6 = _mm_loadu_si128((const __m128i *)(blk + 48));
__m128i v7 = _mm_loadu_si128((const __m128i *)(blk + 56));
__m128i hit0 = _mm_or_si128(_mm_cmpeq_epi16(v0, needle),
_mm_cmpeq_epi16(v1, needle));
__m128i hit1 = _mm_or_si128(_mm_cmpeq_epi16(v2, needle),
_mm_cmpeq_epi16(v3, needle));
__m128i hit2 = _mm_or_si128(_mm_cmpeq_epi16(v4, needle),
_mm_cmpeq_epi16(v5, needle));
__m128i hit3 = _mm_or_si128(_mm_cmpeq_epi16(v6, needle),
_mm_cmpeq_epi16(v7, needle));
__m128i hit = _mm_or_si128(_mm_or_si128(hit0, hit1), _mm_or_si128(hit2, hit3));
return _mm_movemask_epi8(hit) != 0;
#endif
}
for (int32_t j = num_blocks * gap; j < cardinality; j++) {
uint16_t v = carr[j];
if (v >= pos) return (v == pos);
}
return false;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment