Created
July 25, 2018 21:15
-
-
Save BillyONeal/72dcde394758d4f9d82324774b8107e4 to your computer and use it in GitHub Desktop.
Unicode CVT
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <string.h> | |
#include <algorithm> | |
#include <chrono> | |
#include <codecvt> | |
#include <locale> | |
#include <stdexcept> | |
#include <string> | |
#include <intrin.h> | |
#include <smmintrin.h> | |
#include <benchmark/benchmark.h> | |
#include "file.hpp" | |
// Everything below stores UTF-16 code units in wchar_t, so wchar_t has to be exactly as wide as
// char16_t; refuse to build on targets where it is not.
static_assert(sizeof(wchar_t) == sizeof(char16_t), "BOOM");
// One UTF-16 code unit.
using utf16char = wchar_t;
// A string of UTF-16 code units.
using utf16string = std::basic_string<utf16char>;
// Baseline: UTF-8 -> UTF-16 through the standard library's (deprecated in C++17) wstring_convert.
// A fresh converter is constructed on every call.
__declspec(noinline) utf16string std_wstring_convert_to_utf16(const std::string& src) {
    using utf8_utf16_converter = std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char>;
    utf8_utf16_converter converter;
    utf16string wide = converter.from_bytes(src);
    return wide;
}
// Baseline: UTF-16 -> UTF-8 through the standard library's (deprecated in C++17) wstring_convert.
// A fresh converter is constructed on every call.
__declspec(noinline) std::string std_wstring_convert_to_utf8(const utf16string& src) {
    using utf8_utf16_converter = std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char>;
    utf8_utf16_converter converter;
    std::string narrow = converter.to_bytes(src);
    return narrow;
}
// Masks selecting the payload bits of a UTF-8 byte.
#define LOW_3BITS 0x07 // payload of a 4-byte lead (11110xxx)
#define LOW_4BITS 0x0F // payload of a 3-byte lead (1110xxxx)
#define LOW_5BITS 0x1F // payload of a 2-byte lead (110xxxxx)
#define LOW_6BITS 0x3F // payload of a continuation byte (10xxxxxx)
// Top two bits of a byte; a continuation byte has exactly BIT8 set within this mask.
#define HI_2_BITS 0xC0
// Single-bit masks, numbered from 1 at the least significant bit.
#define BIT4 0x08
#define BIT5 0x10
#define BIT6 0x20
#define BIT7 0x40
#define BIT8 0x80
// UTF-16 surrogate ranges: low (trailing) surrogates, then high (leading) surrogates.
#define L_SURROGATE_START 0xDC00
#define L_SURROGATE_END 0xDFFF
#define H_SURROGATE_START 0xD800
#define H_SURROGATE_END 0xDBFF
// First code point that needs a surrogate pair in UTF-16.
#define SURROGATE_PAIR_START 0x10000
// Scalar UTF-8 -> UTF-16 conversion, one byte at a time.
//
// Throws std::range_error when a sequence starts with a continuation byte (10xxxxxx), when the
// lead byte is 11111xxx, when the input ends inside a multi-byte sequence, or when a byte that
// must be a continuation is not of the form 10xxxxxx. The decoded value itself is not checked.
__declspec(noinline) utf16string casablanca_to_utf16(const std::string &s)
{
    utf16string dest;
    // Reserve once up front; multi-byte sequences collapse into fewer code units, so ask for
    // somewhat less than one unit per source byte.
    dest.reserve(static_cast<size_t>(static_cast<double>(s.size()) * .70));

    const auto last = s.end();
    for (auto it = s.begin(); it != last; ++it)
    {
        const char lead = *it;
        if ((lead & BIT8) == 0)
        {
            // 0xxxxxxx: ASCII, copied through unchanged.
            dest.push_back(static_cast<utf16string::value_type>(lead));
            continue;
        }
        if ((lead & BIT7) == 0)
        {
            throw std::range_error("UTF-8 string character can never start with 10xxxxxx");
        }

        // Classify the lead byte: payload bits it carries and how many continuation bytes follow.
        uint32_t codePoint;
        unsigned char remaining;
        if ((lead & BIT6) == 0) // 110xxxxx: 2 byte character, 0x80 to 0x7FF
        {
            codePoint = lead & LOW_5BITS;
            remaining = 1;
        }
        else if ((lead & BIT5) == 0) // 1110xxxx: 3 byte character, 0x800 to 0xFFFF
        {
            codePoint = lead & LOW_4BITS;
            remaining = 2;
        }
        else if ((lead & BIT4) == 0) // 11110xxx: 4 byte character, 0x10000 to 0x10FFFF
        {
            codePoint = lead & LOW_3BITS;
            remaining = 3;
        }
        else
        {
            throw std::range_error("UTF-8 string has invalid Unicode code point");
        }

        // Fold in six payload bits per continuation byte.
        for (; remaining != 0; --remaining)
        {
            ++it;
            if (it == last)
            {
                throw std::range_error("UTF-8 string is missing bytes in character");
            }
            const char trail = *it;
            if ((trail & HI_2_BITS) != BIT8) // anything other than 10xxxxxx
            {
                throw std::range_error("UTF-8 continuation byte is missing leading byte");
            }
            codePoint = (codePoint << 6) | (trail & LOW_6BITS);
        }

        if (codePoint < SURROGATE_PAIR_START)
        {
            // Stored as-is in a single code unit. Values in U+D800..U+DFFF are not valid
            // characters, but they are passed through rather than rejected.
            dest.push_back(static_cast<utf16string::value_type>(codePoint));
        }
        else
        {
            // Split into a surrogate pair: subtract 0x10000, then the top ten bits go into the
            // high surrogate (0xD800 base) and the low ten bits into the low one (0xDC00 base).
            codePoint -= SURROGATE_PAIR_START;
            dest.push_back(static_cast<utf16string::value_type>((codePoint >> 10) | H_SURROGATE_START));
            dest.push_back(static_cast<utf16string::value_type>((codePoint & 0x3FF) | L_SURROGATE_START));
        }
    }
    return dest;
}
// Scalar UTF-16 -> UTF-8 conversion, one code unit at a time.
//
// Throws std::range_error when a high surrogate is the last unit of the input or is not followed
// by a low surrogate. A low surrogate with no preceding high surrogate is not rejected; it is
// encoded as a 3-byte sequence.
__declspec(noinline) std::string casablanca_to_utf8(const utf16string &w)
{
    std::string dest;
    dest.reserve(w.size());

    const auto last = w.end();
    for (auto it = w.begin(); it != last; ++it)
    {
        const auto unit = *it;
        if (unit < H_SURROGATE_START || unit > H_SURROGATE_END)
        {
            // Not a high surrogate: the unit is encoded on its own.
            if (unit <= 0x7F) // single byte character
            {
                dest.push_back(static_cast<char>(unit));
            }
            else if (unit <= 0x7FF) // 2 bytes needed (11 bits used)
            {
                dest.push_back(static_cast<char>((unit >> 6) | 0xC0));         // leading 5 bits
                dest.push_back(static_cast<char>((unit & LOW_6BITS) | BIT8));  // trailing 6 bits
            }
            else // 3 bytes needed (16 bits used)
            {
                dest.push_back(static_cast<char>((unit >> 12) | 0xE0));               // leading 4 bits
                dest.push_back(static_cast<char>(((unit >> 6) & LOW_6BITS) | BIT8));  // middle 6 bits
                dest.push_back(static_cast<char>((unit & LOW_6BITS) | BIT8));         // trailing 6 bits
            }
            continue;
        }

        // High surrogate: the next unit must exist and be a low surrogate.
        const auto highSurrogate = unit;
        ++it;
        if (it == last)
        {
            throw std::range_error("UTF-16 string is missing low surrogate");
        }
        const auto lowSurrogate = *it;
        if (lowSurrogate < L_SURROGATE_START || lowSurrogate > L_SURROGATE_END)
        {
            throw std::range_error("UTF-16 string has invalid low surrogate");
        }

        // Rebuild the code point: the high surrogate's offset from 0xD800 is the top ten bits,
        // the low surrogate's offset from 0xDC00 the bottom ten, and 0x10000 is added back.
        // The result lies in U+10000..U+10FFFF.
        uint32_t codePoint = highSurrogate - H_SURROGATE_START;
        codePoint = (codePoint << 10) | (lowSurrogate - L_SURROGATE_START);
        codePoint += SURROGATE_PAIR_START;

        // 4 bytes carrying 21 bits
        dest.push_back(static_cast<char>((codePoint >> 18) | 0xF0));                 // leading 3 bits
        dest.push_back(static_cast<char>(((codePoint >> 12) & LOW_6BITS) | BIT8));   // next 6 bits
        dest.push_back(static_cast<char>(((codePoint >> 6) & LOW_6BITS) | BIT8));    // next 6 bits
        dest.push_back(static_cast<char>((codePoint & LOW_6BITS) | BIT8));           // trailing 6 bits
    }
    return dest;
}
// The masks below are written for a size_t of exactly 4 or 8 bytes.
static_assert(sizeof(size_t) == 8 || sizeof(size_t) == 4, "This code assumes 32 bit or 64 bit platform");
// 0x80 in every byte of a size_t; the 64-bit pattern is truncated to the low half when size_t is 32 bits.
constexpr size_t allAsciiInUtf8Mask = static_cast<size_t>(0x8080808080808080ull);
// 0xFF80 in every 16-bit lane of a size_t; truncated the same way on 32-bit targets.
constexpr size_t allAsciiInUtf16Mask = static_cast<size_t>(0xFF80FF80FF80FF80ull);
inline size_t sse2_count_utf8_to_utf16(const std::string& s) | |
{ | |
const size_t sSize = s.size(); | |
const char* const sData = s.data(); | |
size_t result{sSize}; // only pay to change this value if non-ASCII values are seen | |
for (size_t index = 0; index < sSize;) | |
{ | |
if ((reinterpret_cast<uintptr_t>(sData + index) & 7) == 0) | |
{ // we're aligned, try SIMD | |
const char * const basisInput = sData + index; | |
const size_t maxLoop = (sSize - index) / 8; | |
size_t thisLoop = 0; | |
for (; thisLoop < maxLoop; ++thisLoop) | |
{ | |
const __m128i input = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(basisInput + thisLoop * 8)); | |
if (_mm_movemask_epi8(input)) | |
{ // a high bit was set, so there's some non-ASCII in this block; break to scalar loop | |
break; | |
} | |
} | |
index += thisLoop * 8; | |
if (index == sSize) | |
{ | |
break; // we're done :) | |
} | |
} | |
const char c{sData[index++]}; | |
if ((c & BIT8) == 0) | |
{ | |
continue; | |
} | |
if ((c & BIT7) == 0) | |
{ | |
throw std::range_error("UTF-8 string character can never start with 10xxxxxx"); | |
} | |
else if ((c & BIT6) == 0) // 2 byte character, 0x80 to 0x7FF | |
{ | |
if (index == sSize) | |
{ | |
throw std::range_error("UTF-8 string is missing bytes in character"); | |
} | |
const char c2{sData[index++]}; | |
if ((c2 & HI_2_BITS) != BIT8) | |
{ | |
throw std::range_error("UTF-8 continuation byte is missing leading byte"); | |
} | |
// can't require surrogates for 7FF, so we can bail | |
--result; | |
} | |
else if ((c & BIT5) == 0) // 3 byte character, 0x800 to 0xFFFF | |
{ | |
if (sSize - index < 2) | |
{ | |
throw std::range_error("UTF-8 string is missing bytes in character"); | |
} | |
const char c2{sData[index++]}; | |
const char c3{sData[index++]}; | |
if (((c2 | c3) & HI_2_BITS) != BIT8) | |
{ | |
throw std::range_error("UTF-8 continuation byte is missing leading byte"); | |
} | |
result -= 2; | |
} | |
else if ((c & BIT4) == 0) // 4 byte character, 0x10000 to 0x10FFFF | |
{ | |
if (sSize - index < 3) | |
{ | |
throw std::range_error("UTF-8 string is missing bytes in character"); | |
} | |
const char c2{sData[index++]}; | |
const char c3{sData[index++]}; | |
const char c4{sData[index++]}; | |
if (((c2 | c3 | c4) & HI_2_BITS) != BIT8) | |
{ | |
throw std::range_error("UTF-8 continuation byte is missing leading byte"); | |
} | |
const uint32_t codePoint = ((c & LOW_3BITS) << 18) | ((c2 & LOW_6BITS) << 12) | ((c3 & LOW_6BITS) << 6) | (c4 & LOW_6BITS); | |
result -= (3 - (codePoint >= SURROGATE_PAIR_START)); | |
} | |
else | |
{ | |
throw std::range_error("UTF-8 string has invalid Unicode code point"); | |
} | |
} | |
return result; | |
} | |
// UTF-8 -> UTF-16 conversion with an SSE2 fast path for ASCII.
//
// The output is sized exactly by sse2_count_utf8_to_utf16, which also performs all validation
// (and throws std::range_error on malformed input); the decoding loop below therefore reads
// continuation bytes without re-checking them.
__declspec(noinline) utf16string sse2_convert_to_utf16(const std::string &s)
{
    utf16string dest(sse2_count_utf8_to_utf16(s), L'\0');
    utf16char * out = &dest[0];
    const char * in = s.data();
    const char * const inEnd = in + s.size();

    while (in < inEnd)
    {
        if ((reinterpret_cast<uintptr_t>(in) & 7) == 0)
        { // 8-byte aligned: widen whole blocks of ASCII at once
            for (size_t blocksLeft = static_cast<size_t>(inEnd - in) / 8; blocksLeft != 0; --blocksLeft)
            {
                const __m128i eightBytes = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(in));
                if (_mm_movemask_epi8(eightBytes) != 0)
                { // some byte has its top bit set: leave this block to the scalar code
                    break;
                }
                // Interleave with zero bytes to turn 8 ASCII bytes into 8 16-bit units.
                const __m128i widened = _mm_unpacklo_epi8(eightBytes, _mm_setzero_si128());
                _mm_storeu_si128(reinterpret_cast<__m128i *>(out), widened);
                in += 8;
                out += 8;
            }
            if (in == inEnd)
            {
                break; // nothing left for the scalar code
            }
        }

        const char lead{*in++};
        if ((lead & BIT8) == 0)
        { // ASCII
            *out++ = lead;
        }
        else if ((lead & BIT6) == 0) // 2 byte character, 0x80 to 0x7FF
        {
            const char t1{*in++};
            *out++ = static_cast<utf16char>(((lead & LOW_5BITS) << 6) | (t1 & LOW_6BITS));
        }
        else if ((lead & BIT5) == 0) // 3 byte character, 0x800 to 0xFFFF
        {
            const char t1{*in++};
            const char t2{*in++};
            *out++ = static_cast<utf16char>(((lead & LOW_4BITS) << 12) | ((t1 & LOW_6BITS) << 6) | (t2 & LOW_6BITS));
        }
        else if ((lead & BIT4) == 0) // 4 byte character, 0x10000 to 0x10FFFF
        {
            const char t1{*in++};
            const char t2{*in++};
            const char t3{*in++};
            uint32_t codePoint = ((lead & LOW_3BITS) << 18) | ((t1 & LOW_6BITS) << 12) | ((t2 & LOW_6BITS) << 6) | (t3 & LOW_6BITS);
            if (codePoint < SURROGATE_PAIR_START)
            {
                // Fits in one unit. Values in U+D800..U+DFFF are not valid characters, but they
                // are passed through rather than rejected.
                *out++ = static_cast<utf16char>(codePoint);
            }
            else
            {
                // Surrogate pair: subtract 0x10000; top ten bits go on 0xD800, low ten on 0xDC00.
                codePoint -= SURROGATE_PAIR_START;
                *out++ = static_cast<utf16char>((codePoint >> 10) | H_SURROGATE_START);
                *out++ = static_cast<utf16char>((codePoint & 0x3FF) | L_SURROGATE_START);
            }
        }
    }
    return dest;
}
inline size_t sse2_count_to_utf8(const utf16string &w) | |
{ | |
const utf16char * const wData = &w[0]; | |
const size_t wSize = w.size(); | |
size_t destSize{wSize}; | |
for (size_t index = 0; index < wSize;) | |
{ | |
if ((reinterpret_cast<uintptr_t>(wData + index) & 15) == 0) | |
{ // 128 bit aligned, try SIMD | |
const utf16char * const basis = wData + index; | |
const size_t maxLoop = (wSize - index) / 8; | |
size_t basisOffset = 0; | |
for (; basisOffset < maxLoop; ++basisOffset) | |
{ | |
const __m128i asciiShiftOffset = _mm_set1_epi16(0x7FFFu - 0x0080u); | |
const __m128i input = _mm_load_si128(reinterpret_cast<const __m128i *>(basis + basisOffset * 8)); | |
const __m128i shiftedToTop = _mm_add_epi16(input, asciiShiftOffset); | |
const __m128i compareResults = _mm_cmplt_epi16(shiftedToTop, asciiShiftOffset); | |
if (_mm_movemask_epi8(compareResults)) | |
{ // found non ASCII, break to scalar loop | |
break; | |
} | |
} | |
index += 8 * basisOffset; | |
if (index == wSize) | |
{ | |
break; // we're done :) | |
} | |
} | |
const uint16_t ch{wData[index++]}; | |
if (ch > 0x7Fu) // single byte character | |
{ | |
if (ch <= 0x7FFu) // 2 bytes needed (11 bits used) | |
{ | |
++destSize; | |
} | |
else if (ch >= H_SURROGATE_START && ch <= H_SURROGATE_END) // Check for high surrogate. | |
{ | |
if (index == wSize) | |
{ | |
throw std::range_error("UTF-16 string is missing low surrogate"); | |
} | |
const auto lowSurrogate = wData[index++]; | |
if (lowSurrogate < L_SURROGATE_START || lowSurrogate > L_SURROGATE_END) | |
{ | |
throw std::range_error("UTF-16 string has invalid low surrogate"); | |
} | |
destSize += 2; // 4 bytes need using 21 bits | |
} | |
else // 3 bytes needed (16 bits used) | |
{ | |
destSize += 2; | |
} | |
} | |
} | |
return destSize; | |
} | |
__declspec(noinline) std::string sse2_convert_to_utf8(const utf16string &w) | |
{ | |
const utf16char * const wData = &w[0]; | |
const size_t wSize = w.size(); | |
std::string dest(sse2_count_to_utf8(w), '\0'); | |
char * const destData = &dest[0]; | |
size_t destIndex{}; | |
for (size_t index = 0; index < wSize;) | |
{ | |
if ((reinterpret_cast<uintptr_t>(wData + index) & 15) == 0) | |
{ // 128 bit aligned, try SIMD | |
const utf16char * const basis = wData + index; | |
char * const destBasis = destData + destIndex; | |
const size_t maxLoop = (wSize - index) / 8; | |
size_t basisOffset = 0; | |
for (; basisOffset < maxLoop; ++basisOffset) | |
{ | |
const __m128i asciiShiftOffset = _mm_set1_epi16(0x7FFFu - 0x0080u); | |
const __m128i input = _mm_load_si128(reinterpret_cast<const __m128i *>(basis + basisOffset * 8)); | |
const __m128i shiftedToTop = _mm_add_epi16(input, asciiShiftOffset); | |
const __m128i compareResults = _mm_cmplt_epi16(shiftedToTop, asciiShiftOffset); | |
if (_mm_movemask_epi8(compareResults)) | |
{ // found non ASCII, break to scalar loop | |
break; | |
} | |
const __m128i result = _mm_packus_epi16(input, _mm_setzero_si128()); | |
_mm_storel_epi64(reinterpret_cast<__m128i *>(destBasis + basisOffset * 8), result); | |
} | |
index += 8 * basisOffset; | |
destIndex += 8 * basisOffset; | |
if (index == wSize) | |
{ | |
break; // we're done :) | |
} | |
} | |
const uint16_t ch{wData[index++]}; | |
if (ch <= 0x7Fu) // single byte character | |
{ | |
destData[destIndex++] = static_cast<char>(ch); | |
} | |
else if (ch <= 0x7FFu) // 2 bytes needed (11 bits used) | |
{ | |
destData[destIndex++] = char((ch >> 6) | 0xC0); // leading 5 bits | |
destData[destIndex++] = char((ch & LOW_6BITS) | BIT8); // trailing 6 bits | |
} | |
else if (ch >= H_SURROGATE_START && ch <= H_SURROGATE_END) // Check for high surrogate. | |
{ | |
const auto highSurrogate = ch; | |
const auto lowSurrogate = wData[index++]; | |
// To get from surrogate pair to Unicode code point: | |
// - subract 0xD800 from high surrogate, this forms top ten bits | |
// - subract 0xDC00 from low surrogate, this forms low ten bits | |
// - add 0x10000 | |
// Leaves a code point in U+10000 to U+10FFFF range. | |
uint32_t codePoint = (((highSurrogate - H_SURROGATE_START) << 10) | |
| (lowSurrogate - L_SURROGATE_START)) + SURROGATE_PAIR_START; | |
// 4 bytes need using 21 bits | |
destData[destIndex++] = char((codePoint >> 18) | 0xF0); // leading 3 bits | |
destData[destIndex++] = char(((codePoint >> 12) & LOW_6BITS) | BIT8); // next 6 bits | |
destData[destIndex++] = char(((codePoint >> 6) & LOW_6BITS) | BIT8); // next 6 bits | |
destData[destIndex++] = char((codePoint & LOW_6BITS) | BIT8); // trailing 6 bits | |
} | |
else // 3 bytes needed (16 bits used) | |
{ | |
destData[destIndex++] = char((ch >> 12) | 0xE0); // leading 4 bits | |
destData[destIndex++] = char(((ch >> 6) & LOW_6BITS) | BIT8); // middle 6 bits | |
destData[destIndex++] = char((ch & LOW_6BITS) | BIT8); // trailing 6 bits | |
} | |
} | |
return dest; | |
} | |
inline size_t sse4_count_to_utf8(const utf16string &w) | |
{ | |
const utf16char * const wData = &w[0]; | |
const size_t wSize = w.size(); | |
size_t destSize{ wSize }; | |
for (size_t index = 0; index < wSize;) | |
{ | |
if ((reinterpret_cast<uintptr_t>(wData + index) & 15) == 0) | |
{ // 128 bit aligned, try SIMD | |
const utf16char * const basis = wData + index; | |
const size_t maxLoop = (wSize - index) / 8; | |
size_t basisOffset = 0; | |
for (; basisOffset < maxLoop; ++basisOffset) | |
{ | |
const __m128i input = _mm_load_si128(reinterpret_cast<const __m128i *>(basis + basisOffset * 8)); | |
if (!_mm_test_all_zeros(input, _mm_set1_epi16(0xFF80u))) | |
{ // found non ASCII, break to scalar loop | |
break; | |
} | |
} | |
index += 8 * basisOffset; | |
if (index == wSize) | |
{ | |
break; // we're done :) | |
} | |
} | |
const uint16_t ch{ wData[index++] }; | |
if (ch > 0x7Fu) // single byte character | |
{ | |
if (ch <= 0x7FFu) // 2 bytes needed (11 bits used) | |
{ | |
++destSize; | |
} | |
else if (ch >= H_SURROGATE_START && ch <= H_SURROGATE_END) // Check for high surrogate. | |
{ | |
if (index == wSize) | |
{ | |
throw std::range_error("UTF-16 string is missing low surrogate"); | |
} | |
const auto lowSurrogate = wData[index++]; | |
if (lowSurrogate < L_SURROGATE_START || lowSurrogate > L_SURROGATE_END) | |
{ | |
throw std::range_error("UTF-16 string has invalid low surrogate"); | |
} | |
destSize += 2; // 4 bytes need using 21 bits | |
} | |
else // 3 bytes needed (16 bits used) | |
{ | |
destSize += 2; | |
} | |
} | |
} | |
return destSize; | |
} | |
// UTF-16 -> UTF-8 conversion with an SSE4.1 fast path for ASCII.
//
// The output is sized exactly by sse4_count_to_utf8, which also validates surrogate pairing
// (and throws std::range_error on a broken pair); the encoding loop below therefore reads the
// low surrogate without re-checking it.
__declspec(noinline) std::string sse4_convert_to_utf8(const utf16string &w)
{
    const utf16char * in = &w[0];
    const utf16char * const inEnd = in + w.size();
    std::string dest(sse4_count_to_utf8(w), '\0');
    char * out = &dest[0];

    while (in < inEnd)
    {
        if ((reinterpret_cast<uintptr_t>(in) & 15) == 0)
        { // 16-byte aligned: narrow whole blocks of eight ASCII units at once
            for (size_t blocksLeft = static_cast<size_t>(inEnd - in) / 8; blocksLeft != 0; --blocksLeft)
            {
                const __m128i input = _mm_load_si128(reinterpret_cast<const __m128i *>(in));
                // A unit is ASCII exactly when none of its bits in 0xFF80 are set.
                if (!_mm_test_all_zeros(input, _mm_set1_epi16(0xFF80u)))
                {
                    break; // non-ASCII somewhere in this block; let the scalar code handle it
                }
                // Pack the eight 16-bit units down to eight bytes and store the low 64 bits.
                const __m128i packed = _mm_packus_epi16(input, _mm_setzero_si128());
                _mm_storel_epi64(reinterpret_cast<__m128i *>(out), packed);
                in += 8;
                out += 8;
            }
            if (in == inEnd)
            {
                break; // nothing left for the scalar code
            }
        }

        const uint16_t ch{ *in++ };
        if (ch <= 0x7Fu) // single byte character
        {
            *out++ = static_cast<char>(ch);
        }
        else if (ch <= 0x7FFu) // 2 bytes needed (11 bits used)
        {
            *out++ = static_cast<char>((ch >> 6) | 0xC0);         // leading 5 bits
            *out++ = static_cast<char>((ch & LOW_6BITS) | BIT8);  // trailing 6 bits
        }
        else if (ch >= H_SURROGATE_START && ch <= H_SURROGATE_END) // high surrogate
        {
            const utf16char lowSurrogate = *in++;
            // Rebuild the code point: the high surrogate's offset from 0xD800 is the top ten
            // bits, the low surrogate's offset from 0xDC00 the bottom ten, and 0x10000 is added
            // back. The result lies in U+10000..U+10FFFF.
            const uint32_t codePoint = (((ch - H_SURROGATE_START) << 10)
                | (lowSurrogate - L_SURROGATE_START)) + SURROGATE_PAIR_START;
            // 4 bytes needed (21 bits used)
            *out++ = static_cast<char>((codePoint >> 18) | 0xF0);                // leading 3 bits
            *out++ = static_cast<char>(((codePoint >> 12) & LOW_6BITS) | BIT8);  // next 6 bits
            *out++ = static_cast<char>(((codePoint >> 6) & LOW_6BITS) | BIT8);   // next 6 bits
            *out++ = static_cast<char>((codePoint & LOW_6BITS) | BIT8);          // trailing 6 bits
        }
        else // 3 bytes needed (16 bits used)
        {
            *out++ = static_cast<char>((ch >> 12) | 0xE0);                // leading 4 bits
            *out++ = static_cast<char>(((ch >> 6) & LOW_6BITS) | BIT8);   // middle 6 bits
            *out++ = static_cast<char>((ch & LOW_6BITS) | BIT8);          // trailing 6 bits
        }
    }
    return dest;
}
#define NOMINMAX | |
#define WIN32_LEAN_AND_MEAN | |
#include <windows.h> | |
// Baseline: UTF-8 -> UTF-16 through the Win32 MultiByteToWideChar API.
//
// The API is called twice: once with no output buffer to learn the required length in code
// units, then again to fill a string of exactly that length. Returns an empty string when the
// sizing call reports 0, which is what it returns for empty input and on failure.
//
// `s` is taken by const reference so that a benchmarked call does not also pay for copying the
// whole input. Its length is passed to the API as an int.
__declspec(noinline) utf16string windows_multi_byte_to_wide_char(const std::string& s)
{
    const int sourceSize = static_cast<int>(s.size());
    const int desiredSize =
        ::MultiByteToWideChar(CP_UTF8, 0, s.c_str(), sourceSize, nullptr, 0);
    if (desiredSize <= 0)
    {
        return utf16string();
    }
    utf16string dest(desiredSize, L'\0');
    ::MultiByteToWideChar(CP_UTF8, 0, s.c_str(), sourceSize, &dest[0], desiredSize);
    return dest;
}
// Baseline: UTF-16 -> UTF-8 through the Win32 WideCharToMultiByte API.
//
// The API is called twice: once with no output buffer to learn the required length in bytes,
// then again to fill a string of exactly that length. Neither return value is checked beyond
// using the first as the output size.
__declspec(noinline) std::string windows_wide_char_to_multi_byte(const utf16string& w)
{
    const wchar_t * const source = w.c_str();
    const int sourceSize = static_cast<int>(w.size());

    // Sizing pass.
    int desiredSize = ::WideCharToMultiByte(CP_UTF8, 0, source, sourceSize, nullptr, 0, nullptr, nullptr);

    // Conversion pass into a buffer of exactly that size.
    std::string dest(desiredSize, '\0');
    ::WideCharToMultiByte(CP_UTF8, 0, source, sourceSize, &dest[0], desiredSize, nullptr, nullptr);
    return dest;
}
static std::string huckleberryTxt; | |
static utf16string huckleberryTxt16; | |
struct init_huckleberry { | |
init_huckleberry() { | |
read_all("huckleberry.txt", huckleberryTxt); | |
huckleberryTxt16 = windows_multi_byte_to_wide_char(huckleberryTxt); | |
#ifndef KEEP_ZERO_WIDTH_SPACES | |
std::replace(huckleberryTxt16.begin(), huckleberryTxt16.end(), static_cast<wchar_t>(0xA0), static_cast<wchar_t>(0x20)); | |
huckleberryTxt = windows_wide_char_to_multi_byte(huckleberryTxt16); | |
#endif | |
} | |
}; | |
static init_huckleberry init_huckleberry_instance; | |
// False in practice (the clock would have to read exactly its epoch), but computed at run time
// so the compiler cannot assume it.
static const bool always_false = std::chrono::system_clock::time_point{} == std::chrono::system_clock::now();

// Gives a UTF-8 conversion result an apparent use, so the call producing it cannot be
// discarded as dead code.
static void consume(const std::string& b) {
    if (!always_false) {
        return;
    }
    printf("result: %s\n", b.c_str());
}
static void consume(const utf16string& b) { | |
if (always_false) { | |
printf("result: %ls\n", b.c_str()); | |
} | |
} | |
#define MAKE_BENCHMARK(input, output, func) \ | |
static void bench_ ## func (benchmark::State& state) { \ | |
const auto actual = func(input); \ | |
if (actual != output) \ | |
{ \ | |
puts("Bad results in " #func); \ | |
printf("expected size: %zu actual size: %zu\n", output.size(), actual.size()); \ | |
const auto e = std::mismatch(actual.cbegin(), actual.cend(), output.cbegin(), output.cend()); \ | |
printf("at index: %zu", std::min(std::distance(actual.cbegin(), e.first), std::distance(output.cbegin(), e.second))); \ | |
std::terminate(); \ | |
} \ | |
while (state.KeepRunning()) { \ | |
consume(func(input)); \ | |
} \ | |
} \ | |
\ | |
BENCHMARK(bench_ ## func); \ | |
MAKE_BENCHMARK(huckleberryTxt16, huckleberryTxt, std_wstring_convert_to_utf8); | |
MAKE_BENCHMARK(huckleberryTxt16, huckleberryTxt, casablanca_to_utf8); | |
MAKE_BENCHMARK(huckleberryTxt16, huckleberryTxt, sse2_convert_to_utf8); | |
MAKE_BENCHMARK(huckleberryTxt16, huckleberryTxt, sse4_convert_to_utf8); | |
MAKE_BENCHMARK(huckleberryTxt16, huckleberryTxt, windows_wide_char_to_multi_byte); | |
MAKE_BENCHMARK(huckleberryTxt, huckleberryTxt16, std_wstring_convert_to_utf16); | |
MAKE_BENCHMARK(huckleberryTxt, huckleberryTxt16, casablanca_to_utf16); | |
MAKE_BENCHMARK(huckleberryTxt, huckleberryTxt16, sse2_convert_to_utf16); | |
MAKE_BENCHMARK(huckleberryTxt, huckleberryTxt16, windows_multi_byte_to_wide_char); | |
BENCHMARK_MAIN(); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
D:\build>.\codeset_conversion.exe | |
07/25/18 14:13:25 | |
Running .\codeset_conversion.exe | |
Run on (12 X 2904 MHz CPU s) | |
CPU Caches: | |
L1 Data 32K (x6) | |
L1 Instruction 32K (x6) | |
L2 Unified 262K (x6) | |
L3 Unified 12582K (x1) | |
----------------------------------------------------------------------------- | |
Benchmark Time CPU Iterations | |
----------------------------------------------------------------------------- | |
bench_std_wstring_convert_to_utf8 2879317 ns 2913136 ns 236 | |
bench_casablanca_to_utf8 1167771 ns 1171875 ns 560 | |
bench_sse2_convert_to_utf8 294400 ns 291561 ns 2358 | |
bench_sse4_convert_to_utf8 278647 ns 276215 ns 2489 | |
bench_windows_wide_char_to_multi_byte 455527 ns 449219 ns 1600 | |
bench_std_wstring_convert_to_utf16 3600100 ns 3599877 ns 204 | |
bench_casablanca_to_utf16 1339548 ns 1317771 ns 498 | |
bench_sse2_convert_to_utf16 432349 ns 429688 ns 1600 | |
bench_windows_multi_byte_to_wide_char 763676 ns 753348 ns 1120 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment