Last active
December 1, 2022 00:24
-
-
Save addaleax/5e26f220df28073254e44ad78ab4520a to your computer and use it in GitHub Desktop.
UTF-8 to ISO-8859-1 [aka Latin1] converters with gcc-style SIMD C++
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <emmintrin.h>
#include <immintrin.h>
// header
extern "C" {
// Returns true if the `len` bytes of UTF-8 at `str` can be converted to latin1.
bool utf8_can_be_converted_to_latin1(const uint8_t* str, size_t len);
// Converts `str` to latin1 and returns the length of the converted data.
// Modifies str in place! Only safe if utf8_can_be_converted_to_latin1() has
// returned true before.
size_t utf8_convert_to_latin1(uint8_t* str, size_t len);
}
// source
// Scalar check: returns true when none of the `len` bytes at `str` is greater
// than 0xC3. A UTF-8 lead byte above 0xC3 starts a code point above U+00FF,
// which has no latin1 representation. An empty input yields true.
static bool utf8_can_be_converted_to_latin1_nosimd(const uint8_t* str, size_t len) {
  for (size_t i = 0; i < len; i++) {
    if (str[i] > 0xC3) return false;
  }
  return true;
}
// SSE2 check: returns true when none of the `len` bytes at `str` is greater
// than 0xC3. Examines 16 bytes per iteration and finishes the remaining
// (fewer than 16) bytes with a scalar loop.
__attribute__((target("sse2")))
static bool utf8_can_be_converted_to_latin1_sse2(const uint8_t* str, size_t len) {
  const __m128i limit = _mm_set1_epi8(static_cast<char>(0xC3));
  size_t i = 0;
  for (; len - i >= 16; i += 16) {
    const __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(str + i));
    // The byte comparison intrinsics are signed, which would misorder every
    // byte >= 0x80. The unsigned maximum of (byte, 0xC3) equals 0xC3 exactly
    // when byte <= 0xC3, so each acceptable lane compares equal below.
    const __m128i acceptable = _mm_cmpeq_epi8(_mm_max_epu8(chunk, limit), limit);
    if (_mm_movemask_epi8(acceptable) != 0xFFFF) return false;
  }
  // Scalar tail: a full 16-byte load here would read past the buffer.
  for (; i < len; i++) {
    if (str[i] > 0xC3) return false;
  }
  return true;
}
// AVX2 check: returns true when none of the `len` bytes at `str` is greater
// than 0xC3. Examines 32 bytes per iteration, then at most one 16-byte step,
// then the remaining (fewer than 16) bytes with a scalar loop.
__attribute__((target("avx2")))
static bool utf8_can_be_converted_to_latin1_avx2(const uint8_t* str, size_t len) {
  const __m256i limit256 = _mm256_set1_epi8(static_cast<char>(0xC3));
  size_t i = 0;
  for (; len - i >= 32; i += 32) {
    const __m256i chunk = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(str + i));
    // The byte comparison intrinsics are signed, which would misorder every
    // byte >= 0x80. The unsigned maximum of (byte, 0xC3) equals 0xC3 exactly
    // when byte <= 0xC3, so each acceptable lane compares equal below.
    const __m256i acceptable = _mm256_cmpeq_epi8(_mm256_max_epu8(chunk, limit256), limit256);
    // All 32 mask bits set reads as -1 in the int that movemask returns.
    if (_mm256_movemask_epi8(acceptable) != -1) return false;
  }
  if (len - i >= 16) {
    const __m128i limit128 = _mm_set1_epi8(static_cast<char>(0xC3));
    const __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(str + i));
    const __m128i acceptable = _mm_cmpeq_epi8(_mm_max_epu8(chunk, limit128), limit128);
    if (_mm_movemask_epi8(acceptable) != 0xFFFF) return false;
    i += 16;
  }
  // Scalar tail: a full vector load here would read past the buffer.
  for (; i < len; i++) {
    if (str[i] > 0xC3) return false;
  }
  return true;
}
// One scalar conversion step. Expects `str`, `len`, `read` and `write` to be
// in scope at the point of expansion. A byte with its two top bits set that is
// followed by at least one more byte is decoded together with that next byte
// as a two-byte UTF-8 sequence; any other byte is copied unchanged. The copy
// matters as soon as an earlier two-byte sequence has left `write` behind
// `read`. Stays a macro because it may be expanded elsewhere in this file.
#define UTF8_CONVERT_TO_LATIN1_READ_WRITE_NOSIMD \
  if ((*read & 0xC0) == 0xC0 && read + 1 < str + len) { \
    *write = static_cast<uint8_t>(((read[0] & 0x1F) << 6) | (read[1] & 0x3F)); \
    read += 2; \
  } else { \
    *write = *read; \
    read++; \
  } \
  write++;

// Scalar in-place conversion of the `len` bytes at `str`. Returns the number
// of bytes written, which never exceeds `len`.
static size_t utf8_convert_to_latin1_nosimd(uint8_t* str, size_t len) {
  uint8_t* read = str;
  uint8_t* write = str;
  while (read < str + len) {
    UTF8_CONVERT_TO_LATIN1_READ_WRITE_NOSIMD
  }
  return static_cast<size_t>(write - str);
}
// SSE2 in-place conversion of the `len` bytes at `str`. Runs of 16 ASCII bytes
// are moved with one vector load/store; everything else goes through a scalar
// step. Returns the number of bytes written, which never exceeds `len`.
__attribute__((target("sse2")))
static size_t utf8_convert_to_latin1_sse2(uint8_t* str, size_t len) {
  uint8_t* const end = str + len;
  uint8_t* read = str;
  uint8_t* write = str;
  while (read < end) {
    // Compare the remaining byte count instead of forming `read + 16`, which
    // could point past the end of the buffer.
    if (static_cast<size_t>(end - read) >= 16) {
      const __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(read));
      // movemask collects the top bit of every byte; zero means all ASCII.
      if (_mm_movemask_epi8(chunk) == 0) {
        // Once a two-byte sequence has been collapsed, `write` trails `read`
        // and the bytes have to be moved. The chunk is already in a register,
        // so the overlapping store is safe.
        if (write != read) {
          _mm_storeu_si128(reinterpret_cast<__m128i*>(write), chunk);
        }
        read += 16;
        write += 16;
        continue;
      }
    }
    // Scalar step: decode a two-byte sequence or copy a single byte.
    if ((*read & 0xC0) == 0xC0 && read + 1 < end) {
      *write = static_cast<uint8_t>(((read[0] & 0x1F) << 6) | (read[1] & 0x3F));
      read += 2;
    } else {
      *write = *read;
      read++;
    }
    write++;
  }
  return static_cast<size_t>(write - str);
}
// AVX2 in-place conversion of the `len` bytes at `str`. Runs of 32 (or, failing
// that, 16) ASCII bytes are moved with one vector load/store; everything else
// goes through a scalar step. Returns the number of bytes written, which never
// exceeds `len`. The target attribute lets the 256-bit intrinsics compile
// without enabling AVX2 for the whole translation unit.
__attribute__((target("avx2")))
static size_t utf8_convert_to_latin1_avx2(uint8_t* str, size_t len) {
  uint8_t* const end = str + len;
  uint8_t* read = str;
  uint8_t* write = str;
  while (read < end) {
    // Compare the remaining byte count instead of forming `read + 32`, which
    // could point past the end of the buffer.
    const size_t remaining = static_cast<size_t>(end - read);
    if (remaining >= 32) {
      const __m256i chunk = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(read));
      // movemask collects the top bit of every byte; zero means all ASCII.
      if (_mm256_movemask_epi8(chunk) == 0) {
        // Once a two-byte sequence has been collapsed, `write` trails `read`
        // and the bytes have to be moved. The chunk is already in a register,
        // so the overlapping store is safe.
        if (write != read) {
          _mm256_storeu_si256(reinterpret_cast<__m256i*>(write), chunk);
        }
        read += 32;
        write += 32;
        continue;
      }
    }
    if (remaining >= 16) {
      const __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(read));
      if (_mm_movemask_epi8(chunk) == 0) {
        if (write != read) {
          _mm_storeu_si128(reinterpret_cast<__m128i*>(write), chunk);
        }
        read += 16;
        write += 16;
        continue;
      }
    }
    // Scalar step: decode a two-byte sequence or copy a single byte.
    if ((*read & 0xC0) == 0xC0 && read + 1 < end) {
      *write = static_cast<uint8_t>(((read[0] & 0x1F) << 6) | (read[1] & 0x3F));
      read += 2;
    } else {
      *write = *read;
      read++;
    }
    write++;
  }
  return static_cast<size_t>(write - str);
}
extern "C" { | |
bool(* resolve_can_convert())(const uint8_t*, size_t) { | |
__builtin_cpu_init (); | |
if (__builtin_cpu_supports ("avx2")) | |
return utf8_can_be_converted_to_latin1_avx2; | |
else if (__builtin_cpu_supports ("sse2")) | |
return utf8_can_be_converted_to_latin1_sse2; | |
else | |
return utf8_can_be_converted_to_latin1_nosimd; | |
} | |
size_t(* resolve_convert())(uint8_t*, size_t) { | |
__builtin_cpu_init (); | |
if (__builtin_cpu_supports ("avx2")) | |
return utf8_convert_to_latin1_avx2; | |
else if (__builtin_cpu_supports ("sse2")) | |
return utf8_convert_to_latin1_sse2; | |
else | |
return utf8_convert_to_latin1_nosimd; | |
} | |
bool utf8_can_be_converted_to_latin1(const uint8_t* str, size_t len) __attribute__ ((ifunc ("resolve_can_convert"))); | |
size_t utf8_convert_to_latin1(uint8_t* str, size_t len) __attribute__ ((ifunc ("resolve_convert"))); | |
} | |
// test
#if 1 | |
#include <assert.h> | |
#include <string> | |
int main() { | |
auto can_convert = [](const char* ustr) { | |
return utf8_can_be_converted_to_latin1((uint8_t*)ustr, strlen(ustr)); | |
}; | |
assert(can_convert("") == true); | |
assert(can_convert("hello") == true); | |
assert(can_convert("hellooooooooooooooooooooooooooooooooo") == true); | |
assert(can_convert("hellö") == true); | |
assert(can_convert("hellā") == false); | |
assert(can_convert("hellāaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa") == false); | |
assert(can_convert("hellaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaā") == false); | |
auto convert = [](const char* ustr) { | |
std::string copy(ustr); | |
size_t newlen = utf8_convert_to_latin1((uint8_t*)©[0], copy.size()); | |
copy.resize(newlen); | |
return copy; | |
}; | |
assert(convert("hello") == "hello"); | |
assert(convert("hellö") == "hell\xf6"); | |
assert(convert("hellllllllllllllllllllllllllllllllllllllö") == "hellllllllllllllllllllllllllllllllllllll\xf6"); | |
} | |
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment