Last active
March 3, 2025 19:48
-
-
Save Raimo33/3ad6a9400a87eb1a21876759382f7312 to your computer and use it in GitHub Desktop.
SIMD strtolower
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Broadcast byte constants used by the vectorised loops of strtolower().
// They are written once by http_deserializer_init() and only read afterwards.
// TODO: find a way to make them const (constexpr?) so that later writes, and
// the thread-safety issues they would cause, are impossible.

#ifdef __SSE2__
// 128-bit (SSE2) set
static __m128i _128_vec_A_minus_1;  // every byte = 'A' - 1
static __m128i _128_vec_case_range; // every byte = 'Z' - 'A' + 1
static __m128i _128_add_mask;       // every byte = 'a' - 'A'
#endif

#ifdef __AVX2__
// 256-bit (AVX2) set
static __m256i _256_vec_A_minus_1;  // every byte = 'A' - 1
static __m256i _256_vec_case_range; // every byte = 'Z' - 'A' + 1
static __m256i _256_add_mask;       // every byte = 'a' - 'A'
#endif

#ifdef __AVX512F__
// 512-bit (AVX-512) set
static __m512i _512_vec_A_minus_1;  // every byte = 'A' - 1
static __m512i _512_vec_case_range; // every byte = 'Z' - 'A' + 1
static __m512i _512_add_mask;       // every byte = 'a' - 'A'
#endif
CONSTRUCTOR void http_deserializer_init(void) | |
{ | |
#ifdef __AVX512F__ | |
_512_vec_A_minus_1 = _mm512_set1_epi8('A' - 1); | |
_512_vec_case_range = _mm512_set1_epi8('Z' - 'A' + 1); | |
_512_add_mask = _mm512_set1_epi8('a' - 'A'); | |
#endif | |
#ifdef __AVX2__ | |
_256_vec_A_minus_1 = _mm256_set1_epi8('A' - 1); | |
_256_vec_case_range = _mm256_set1_epi8('Z' - 'A' + 1); | |
_256_add_mask = _mm256_set1_epi8('a' - 'A'); | |
#endif | |
#ifdef __SSE2__ | |
_128_vec_A_minus_1 = _mm_set1_epi8('A' - 1); | |
_128_vec_case_range = _mm_set1_epi8('Z' - 'A' + 1); | |
_128_add_mask = _mm_set1_epi8('a' - 'A'); | |
#endif | |
} | |
static void strtolower(char *str, uint16_t len) | |
{ | |
uint8_t misaligned_bytes = align_forward(str); | |
misaligned_bytes -= (misaligned_bytes > len) * (misaligned_bytes - len); | |
while (UNLIKELY(misaligned_bytes--)) | |
{ | |
const char c = *str; | |
*str++ = c | (((uint8_t)(c - 'A') <= ('Z' - 'A')) << 5); | |
len--; | |
} | |
#ifdef __AVX512F__ | |
while (LIKELY(len >= 64)) | |
{ | |
__m512i chunk = _mm512_load_si512((__m512i *)str); | |
const __m512i shifted = _mm512_xor_si512(chunk, _512_vec_A_minus_1); | |
const __mmask64 cmp_mask = _mm512_cmple_epi8_mask(shifted, _512_vec_case_range); | |
const __m512i add_mask = _mm512_maskz_mov_epi8(cmp_mask, _512_add_mask); | |
chunk = _mm512_add_epi8(chunk, add_mask); | |
_mm512_stream_si512((__m512i *)str, chunk); | |
str += 64; | |
len -= 64; | |
} | |
#endif | |
#ifdef __AVX2__ | |
while (LIKELY(len >= 32)) | |
{ | |
__m256i chunk = _mm256_load_si256((__m256i *)str); | |
const __m256i shifted = _mm256_xor_si256(chunk, _256_vec_A_minus_1); | |
const __m256i cmp_mask = _mm256_cmpgt_epi8(_256_vec_case_range, shifted); | |
const __m256i add_mask = _mm256_and_si256(cmp_mask, _256_add_mask); | |
chunk = _mm256_add_epi8(chunk, add_mask); | |
_mm256_stream_si256((__m256i *)str, chunk); | |
str += 32; | |
len -= 32; | |
} | |
#endif | |
#ifdef __SSE2__ | |
while (LIKELY(len >= 16)) | |
{ | |
__m128i chunk = _mm_load_si128((__m128i *)str); | |
const __m128i shifted = _mm_xor_si128(chunk, _128_vec_A_minus_1); | |
const __m128i cmp_mask = _mm_cmpgt_epi8(_128_vec_case_range, shifted); | |
const __m128i add_mask = _mm_and_si128(cmp_mask, _128_add_mask); | |
chunk = _mm_add_epi8(chunk, add_mask); | |
_mm_stream_si128((__m128i *)str, chunk); | |
str += 16; | |
len -= 16; | |
} | |
#endif | |
constexpr uint64_t all_bytes = 0x0101010101010101ULL; | |
while (LIKELY(len >= 8)) | |
{ | |
const uint64_t octets = *(uint64_t *)str; | |
const uint64_t heptets = octets & (0x7F * all_bytes); | |
const uint64_t is_gt_Z = heptets + (0x7F - 'Z') * all_bytes; | |
const uint64_t is_ge_A = heptets + (0x80 - 'A') * all_bytes; | |
const uint64_t is_ascii = ~octets & (0x80 * all_bytes); | |
const uint64_t is_upper = is_ascii & (is_ge_A ^ is_gt_Z); | |
*(uint64_t *)str = octets | (is_upper >> 2); | |
str += 8; | |
len -= 8; | |
} | |
while (LIKELY(len--)) | |
{ | |
const char c = *str; | |
*str++ = c | (((uint8_t)(c - 'A') <= ('Z' - 'A')) << 5); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment