Created
December 5, 2022 00:02
-
-
Save addaleax/601f5380a53257b80f373ba53376181f to your computer and use it in GitHub Desktop.
ISO-8859-1 to UTF-8 byte length counter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include <emmintrin.h>
#include <immintrin.h>
// header

#ifdef __cplusplus
extern "C" {
#endif

/**
 * Computes how many bytes a Latin-1 (ISO-8859-1) buffer occupies once it is
 * re-encoded as UTF-8.
 *
 * Latin-1 bytes below 0x80 map to a single UTF-8 byte; bytes 0x80..0xFF map
 * to a two-byte sequence. The result is therefore the input length plus the
 * number of input bytes that have their high bit set.
 *
 * @param latin1_str  Pointer to the Latin-1 input bytes. The length is given
 *                    explicitly; no NUL terminator is looked for.
 * @param latin1_len  Number of bytes readable at latin1_str.
 * @return            Byte length of the equivalent UTF-8 encoding.
 */
size_t utf8_length_for_latin1_str(const uint8_t* latin1_str, size_t latin1_len);

#ifdef __cplusplus
}
#endif
// source

/**
 * Scalar implementation: UTF-8 byte length of a Latin-1 buffer.
 *
 * Used as the fallback when no vector instruction set is selected, and as the
 * tail handler for the bytes left over after the SSE2 loop.
 *
 * @param str  Pointer to the Latin-1 input bytes.
 * @param len  Number of bytes readable at str; 0 yields 0.
 * @return     len plus the number of bytes in [str, str + len) that are >= 0x80.
 */
static size_t utf8_length_for_latin1_str_nosimd(const uint8_t* str, size_t len) {
  // Every input byte produces at least one output byte.
  size_t ret = len;
  for (size_t i = 0; i < len; i++) {
    // str[i] >> 7 is 1 exactly for bytes 0x80..0xFF, which need a second
    // UTF-8 byte, and 0 for plain ASCII.
    ret += str[i] >> 7;
  }
  return ret;
}
__attribute__((target("sse2"))) | |
static size_t utf8_length_for_latin1_str_sse2(const uint8_t* str, size_t len) { | |
size_t ret = len - (len % 16); | |
size_t i; | |
for (i = 0; len - i >= 16; i += 16) { | |
__m128i si128 = _mm_loadu_si128((const __m128i*)(str + i)); | |
ret += __builtin_popcount(_mm_movemask_epi8(si128)); | |
} | |
return ret + utf8_length_for_latin1_str_nosimd(str + i, len - i); | |
} | |
__attribute__((target("avx2"))) | |
static size_t utf8_length_for_latin1_str_avx2(const uint8_t* str, size_t len) { | |
size_t ret = len - (len % 32); | |
size_t i; | |
for (i = 0; len - i >= 32; i += 32) { | |
__m256i si256 = _mm256_loadu_si256((const __m256i*)(str + i)); | |
ret += __builtin_popcount(_mm256_movemask_epi8(si256)); | |
} | |
return ret + utf8_length_for_latin1_str_sse2(str + i, len - i); | |
} | |
extern "C" { | |
size_t(* resolve_utf8_length_for_latin1_str())(const uint8_t*, size_t) { | |
__builtin_cpu_init (); | |
if (__builtin_cpu_supports ("avx2")) | |
return utf8_length_for_latin1_str_avx2; | |
else if (__builtin_cpu_supports ("sse2")) | |
return utf8_length_for_latin1_str_sse2; | |
else | |
return utf8_length_for_latin1_str_nosimd; | |
} | |
size_t utf8_length_for_latin1_str(const uint8_t* str, size_t len) __attribute__ ((ifunc ("resolve_utf8_length_for_latin1_str"))); | |
} | |
// test | |
#if 1 | |
#include <assert.h> | |
int main() { | |
auto get_length = [](const char* ustr) { | |
return utf8_length_for_latin1_str((uint8_t*)ustr, strlen(ustr)); | |
}; | |
assert(get_length("") == 0); | |
assert(get_length("hello") == 5); | |
assert(get_length("hellooooooooooooooooooooooooooooooooo") == 37); | |
assert(get_length("hell\xf6") == 6); | |
assert(get_length("hell\xf6" "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa") == 36); | |
assert(get_length("hellaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\xf6") == 37); | |
} | |
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment