Skip to content

Instantly share code, notes, and snippets.

@Raimo33
Last active March 3, 2025 19:48
Show Gist options
  • Save Raimo33/3ad6a9400a87eb1a21876759382f7312 to your computer and use it in GitHub Desktop.
Save Raimo33/3ad6a9400a87eb1a21876759382f7312 to your computer and use it in GitHub Desktop.
SIMD strtolower
// TODO: find a way to make these vectors const (constexpr?), so that they can
// never be written after start-up and thread-safety issues are ruled out.
//
// Per-width broadcast vectors used by strtolower(). They are declared without
// initialisers here and assigned in http_deserializer_init().
#ifdef __AVX512F__
// 512-bit (64 byte lanes) constants.
static __m512i _512_vec_A_minus_1, _512_vec_case_range, _512_add_mask;
#endif
#ifdef __AVX2__
// 256-bit (32 byte lanes) constants.
static __m256i _256_vec_A_minus_1, _256_vec_case_range, _256_add_mask;
#endif
#ifdef __SSE2__
// 128-bit (16 byte lanes) constants.
static __m128i _128_vec_A_minus_1, _128_vec_case_range, _128_add_mask;
#endif
/*
 * Fills the file-scope SIMD constants for every vector width that is compiled
 * in. Each vector holds one scalar value replicated into all byte lanes.
 *
 * NOTE(review): CONSTRUCTOR is defined elsewhere; presumably it makes this
 * function run automatically before main() - confirm against its definition.
 */
CONSTRUCTOR void http_deserializer_init(void)
{
  // Scalar values that get broadcast into each vector.
  const char below_upper_a = 'A' - 1;      // byte value just below 'A'
  const char letter_span = 'Z' - 'A' + 1;  // number of upper-case letters
  const char case_delta = 'a' - 'A';       // distance from upper to lower case

  // Keep the build warning-free when no SIMD extension is enabled.
  (void)below_upper_a;
  (void)letter_span;
  (void)case_delta;

#ifdef __SSE2__
  _128_vec_A_minus_1 = _mm_set1_epi8(below_upper_a);
  _128_vec_case_range = _mm_set1_epi8(letter_span);
  _128_add_mask = _mm_set1_epi8(case_delta);
#endif
#ifdef __AVX2__
  _256_vec_A_minus_1 = _mm256_set1_epi8(below_upper_a);
  _256_vec_case_range = _mm256_set1_epi8(letter_span);
  _256_add_mask = _mm256_set1_epi8(case_delta);
#endif
#ifdef __AVX512F__
  _512_vec_A_minus_1 = _mm512_set1_epi8(below_upper_a);
  _512_vec_case_range = _mm512_set1_epi8(letter_span);
  _512_add_mask = _mm512_set1_epi8(case_delta);
#endif
}
static void strtolower(char *str, uint16_t len)
{
uint8_t misaligned_bytes = align_forward(str);
misaligned_bytes -= (misaligned_bytes > len) * (misaligned_bytes - len);
while (UNLIKELY(misaligned_bytes--))
{
const char c = *str;
*str++ = c | (((uint8_t)(c - 'A') <= ('Z' - 'A')) << 5);
len--;
}
#ifdef __AVX512F__
while (LIKELY(len >= 64))
{
__m512i chunk = _mm512_load_si512((__m512i *)str);
const __m512i shifted = _mm512_xor_si512(chunk, _512_vec_A_minus_1);
const __mmask64 cmp_mask = _mm512_cmple_epi8_mask(shifted, _512_vec_case_range);
const __m512i add_mask = _mm512_maskz_mov_epi8(cmp_mask, _512_add_mask);
chunk = _mm512_add_epi8(chunk, add_mask);
_mm512_stream_si512((__m512i *)str, chunk);
str += 64;
len -= 64;
}
#endif
#ifdef __AVX2__
while (LIKELY(len >= 32))
{
__m256i chunk = _mm256_load_si256((__m256i *)str);
const __m256i shifted = _mm256_xor_si256(chunk, _256_vec_A_minus_1);
const __m256i cmp_mask = _mm256_cmpgt_epi8(_256_vec_case_range, shifted);
const __m256i add_mask = _mm256_and_si256(cmp_mask, _256_add_mask);
chunk = _mm256_add_epi8(chunk, add_mask);
_mm256_stream_si256((__m256i *)str, chunk);
str += 32;
len -= 32;
}
#endif
#ifdef __SSE2__
while (LIKELY(len >= 16))
{
__m128i chunk = _mm_load_si128((__m128i *)str);
const __m128i shifted = _mm_xor_si128(chunk, _128_vec_A_minus_1);
const __m128i cmp_mask = _mm_cmpgt_epi8(_128_vec_case_range, shifted);
const __m128i add_mask = _mm_and_si128(cmp_mask, _128_add_mask);
chunk = _mm_add_epi8(chunk, add_mask);
_mm_stream_si128((__m128i *)str, chunk);
str += 16;
len -= 16;
}
#endif
constexpr uint64_t all_bytes = 0x0101010101010101ULL;
while (LIKELY(len >= 8))
{
const uint64_t octets = *(uint64_t *)str;
const uint64_t heptets = octets & (0x7F * all_bytes);
const uint64_t is_gt_Z = heptets + (0x7F - 'Z') * all_bytes;
const uint64_t is_ge_A = heptets + (0x80 - 'A') * all_bytes;
const uint64_t is_ascii = ~octets & (0x80 * all_bytes);
const uint64_t is_upper = is_ascii & (is_ge_A ^ is_gt_Z);
*(uint64_t *)str = octets | (is_upper >> 2);
str += 8;
len -= 8;
}
while (LIKELY(len--))
{
const char c = *str;
*str++ = c | (((uint8_t)(c - 'A') <= ('Z' - 'A')) << 5);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment