BillyONeal · July 25, 2018 21:15
diff --git a/codeset_conversion.cpp b/codeset_conversion.cpp
 #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
 #include <algorithm>
 #include <chrono>
 #include <codecvt>
 #include <locale>
 #include <stdexcept>
 #include <string>
 #include <intrin.h>
 #include <smmintrin.h>
 #include <benchmark/benchmark.h>
 #include "file.hpp"

 // The routines in this file store UTF-16 code units in wchar_t, so wchar_t must
 // have the same size as char16_t; fail the build otherwise.
 static_assert(sizeof(char16_t) == sizeof(wchar_t), "BOOM");
 // Code-unit and string types used for all UTF-16 text in this file.
 using utf16char = wchar_t;
 using utf16string = std::basic_string<wchar_t>;

 // Baseline: converts UTF-8 bytes to UTF-16 with std::wstring_convert and a
 // std::codecvt_utf8_utf16 facet. Marked noinline so it is measured as a call.
 __declspec(noinline) utf16string std_wstring_convert_to_utf16(const std::string& src) {
    using utf8_utf16_facet = std::codecvt_utf8_utf16<utf16char>;
    std::wstring_convert<utf8_utf16_facet, utf16char> converter;
    utf16string widened = converter.from_bytes(src);
    return widened;
 }

 // Baseline: converts UTF-16 code units to UTF-8 bytes with std::wstring_convert
 // and a std::codecvt_utf8_utf16 facet. Marked noinline so it is measured as a call.
 __declspec(noinline) std::string std_wstring_convert_to_utf8(const utf16string& src) {
    using utf8_utf16_facet = std::codecvt_utf8_utf16<utf16char>;
    std::wstring_convert<utf8_utf16_facet, utf16char> converter;
    std::string narrowed = converter.to_bytes(src);
    return narrowed;
 }

 // Masks selecting the payload bits of UTF-8 lead bytes (3, 4 or 5 low bits)
 // and of continuation bytes (6 low bits).
 #define LOW_3BITS 0x7
 #define LOW_4BITS 0xF
 #define LOW_5BITS 0x1F
 #define LOW_6BITS 0x3F
 // Top two bits of a byte; a UTF-8 continuation byte has these equal to BIT8 (10xxxxxx).
 #define HI_2_BITS 0xC0
 // Single-bit masks. Numbering is 1-based: BIT8 (0x80) is the most significant
 // bit of a byte and BIT4 (0x08) is the fourth bit from the bottom.
 #define BIT4 0x8
 #define BIT5 0x10
 #define BIT6 0x20
 #define BIT7 0x40
 #define BIT8 0x80
 // Inclusive UTF-16 code unit ranges for low (trailing) and high (leading) surrogates.
 #define L_SURROGATE_START 0xDC00
 #define L_SURROGATE_END 0xDFFF
 #define H_SURROGATE_START 0xD800
 #define H_SURROGATE_END 0xDBFF
 // First code point that is encoded in UTF-16 as a surrogate pair.
 #define SURROGATE_PAIR_START 0x10000

 // Reference scalar decoder: converts the UTF-8 bytes in `s` to UTF-16.
 //
 // Throws std::range_error when a character starts with a continuation byte
 // (10xxxxxx), when a lead byte has the form 11111xxx, when the input ends in
 // the middle of a character, or when an expected continuation byte does not
 // have the form 10xxxxxx. Decoded values in the surrogate range are not
 // rejected; they are written out unchanged.
 __declspec(noinline) utf16string casablanca_to_utf16(const std::string &s)
 {
    utf16string dest;
    // Reserve up front to avoid repeated growth. Multi-byte characters collapse
    // into fewer code units, so somewhat less than the byte count is requested.
    dest.reserve(static_cast<size_t>(static_cast<double>(s.size()) * .70));

    const auto last = s.end();
    auto it = s.begin();
    while (it != last)
    {
        const char lead = *it;

        // 0xxxxxxx: one byte, U+0000 to U+007F.
        if ((lead & BIT8) == 0)
        {
            dest.push_back(utf16string::value_type(lead));
            ++it;
            continue;
        }

        if ((lead & BIT7) == 0)
        {
            throw std::range_error("UTF-8 string character can never start with 10xxxxxx");
        }

        // Classify the lead byte: how many continuation bytes follow, and which
        // of its low bits start the code point.
        unsigned int trailingBytes = 0;
        uint32_t codePoint;
        if ((lead & BIT6) == 0) // 110xxxxx: 2 bytes, U+0080 to U+07FF
        {
            trailingBytes = 1;
            codePoint = lead & LOW_5BITS;
        }
        else if ((lead & BIT5) == 0) // 1110xxxx: 3 bytes, U+0800 to U+FFFF
        {
            trailingBytes = 2;
            codePoint = lead & LOW_4BITS;
        }
        else if ((lead & BIT4) == 0) // 11110xxx: 4 bytes, U+10000 to U+10FFFF
        {
            trailingBytes = 3;
            codePoint = lead & LOW_3BITS;
        }
        else
        {
            throw std::range_error("UTF-8 string has invalid Unicode code point");
        }

        // Fold in six payload bits from each continuation byte.
        for (unsigned int remaining = trailingBytes; remaining != 0; --remaining)
        {
            ++it;
            if (it == last)
            {
                throw std::range_error("UTF-8 string is missing bytes in character");
            }

            const char continuation = *it;
            if ((continuation & HI_2_BITS) != BIT8)
            {
                throw std::range_error("UTF-8 continuation byte is missing leading byte");
            }

            codePoint = (codePoint << 6) | (continuation & LOW_6BITS);
        }

        if (codePoint < SURROGATE_PAIR_START)
        {
            // U+0000 to U+D7FF and U+E000 to U+FFFF are stored as the code point
            // value itself. U+D800 to U+DFFF are not valid characters; they are
            // assumed absent but are written through unchanged if encountered.
            dest.push_back(utf16string::value_type(codePoint));
        }
        else
        {
            // U+10000 and above become a surrogate pair: subtract 0x10000, then
            // the top ten bits go into the high surrogate (0xD800 base) and the
            // low ten bits into the low surrogate (0xDC00 base).
            codePoint -= SURROGATE_PAIR_START;
            dest.push_back(utf16string::value_type((codePoint >> 10) | H_SURROGATE_START));
            dest.push_back(utf16string::value_type((codePoint & 0x3FF) | L_SURROGATE_START));
        }

        ++it; // step past the last byte of this character
    }

    return dest;
 }

 // Reference scalar encoder: converts the UTF-16 code units in `w` to UTF-8.
 //
 // Throws std::range_error when a high surrogate is the last code unit or is
 // not followed by a low surrogate. A low surrogate that is not preceded by a
 // high surrogate is not rejected; it is encoded as a three-byte sequence.
 __declspec(noinline) std::string casablanca_to_utf8(const utf16string &w)
 {
    std::string dest;
    dest.reserve(w.size());

    const size_t unitCount = w.size();
    for (size_t pos = 0; pos < unitCount; ++pos)
    {
        const auto unit = w[pos];
        const bool isHighSurrogate = unit >= H_SURROGATE_START && unit <= H_SURROGATE_END;

        if (!isHighSurrogate)
        {
            if (unit <= 0x7F) // one byte
            {
                dest.push_back(static_cast<char>(unit));
            }
            else if (unit <= 0x7FF) // two bytes carry 11 bits
            {
                dest.push_back(char((unit >> 6) | 0xC0));          // leading 5 bits
                dest.push_back(char((unit & LOW_6BITS) | BIT8));   // trailing 6 bits
            }
            else // three bytes carry 16 bits
            {
                dest.push_back(char((unit >> 12) | 0xE0));                // leading 4 bits
                dest.push_back(char(((unit >> 6) & LOW_6BITS) | BIT8));   // middle 6 bits
                dest.push_back(char((unit & LOW_6BITS) | BIT8));          // trailing 6 bits
            }
            continue;
        }

        // A high surrogate must be followed by a low surrogate.
        const auto highSurrogate = unit;
        ++pos;
        if (pos == unitCount)
        {
            throw std::range_error("UTF-16 string is missing low surrogate");
        }

        const auto lowSurrogate = w[pos];
        if (lowSurrogate < L_SURROGATE_START || lowSurrogate > L_SURROGATE_END)
        {
            throw std::range_error("UTF-16 string has invalid low surrogate");
        }

        // Rebuild the code point from the pair:
        // - subtract 0xD800 from the high surrogate to get the top ten bits
        // - subtract 0xDC00 from the low surrogate to get the low ten bits
        // - add 0x10000
        // The result lies in U+10000 to U+10FFFF.
        uint32_t codePoint = highSurrogate - H_SURROGATE_START;
        codePoint <<= 10;
        codePoint |= lowSurrogate - L_SURROGATE_START;
        codePoint += SURROGATE_PAIR_START;

        // Four bytes carry 21 bits.
        dest.push_back(char((codePoint >> 18) | 0xF0));                 // leading 3 bits
        dest.push_back(char(((codePoint >> 12) & LOW_6BITS) | BIT8));   // next 6 bits
        dest.push_back(char(((codePoint >> 6) & LOW_6BITS) | BIT8));    // next 6 bits
        dest.push_back(char((codePoint & LOW_6BITS) | BIT8));           // trailing 6 bits
    }

    return dest;
 }


 // The masks below are spelled out for 4-byte and 8-byte size_t only.
 static_assert(sizeof(size_t) == 4 || sizeof(size_t) == 8, "This code assumes 32 bit or 64 bit platform");
 // size_t-wide mask with the top bit of every byte set (0x80 repeated); those
 // bits are all clear when every byte in the word is ASCII.
 constexpr size_t allAsciiInUtf8Mask{ sizeof(size_t) == 4 ? 0x80808080u : 0x8080808080808080u };
 // size_t-wide mask with the top nine bits of every 16-bit unit set (0xFF80
 // repeated); those bits are all clear when every unit in the word is ASCII.
 constexpr size_t allAsciiInUtf16Mask{ sizeof(size_t) == 4 ? 0xFF80FF80u : 0xFF80FF80FF80FF80u };

 // Validates the UTF-8 bytes in `s` and returns how many UTF-16 code units the
 // converted text occupies.
 //
 // The count starts at s.size() (correct for pure ASCII) and is reduced for every
 // multi-byte character. Runs of ASCII are skipped eight bytes at a time with
 // SSE2 whenever the read position is 8-byte aligned.
 //
 // Throws std::range_error when a character starts with a continuation byte
 // (10xxxxxx), when a lead byte has the form 11111xxx, when the input ends in
 // the middle of a character, or when any expected continuation byte does not
 // have the form 10xxxxxx. Overlong encodings and surrogate values are not
 // rejected.
 inline size_t sse2_count_utf8_to_utf16(const std::string& s)
 {
    const size_t sSize = s.size();
    const char* const sData = s.data();
    size_t result{sSize}; // only pay to change this value if non-ASCII values are seen
    for (size_t index = 0; index < sSize;)
    {
        if ((reinterpret_cast<uintptr_t>(sData + index) & 7) == 0)
        {   // we're aligned, try SIMD
            const char * const basisInput = sData + index;
            const size_t maxLoop = (sSize - index) / 8;
            size_t thisLoop = 0;
            for (; thisLoop < maxLoop; ++thisLoop)
            {
                // Load 8 bytes into the low half of the register; the high half is
                // zero, so only the 8 loaded bytes can contribute to the mask.
                const __m128i input = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(basisInput + thisLoop * 8));
                if (_mm_movemask_epi8(input))
                {   // a high bit was set, so there's some non-ASCII in this block; break to scalar loop
                    break;
                }
            }

            index += thisLoop * 8;

            if (index == sSize)
            {
                break; // we're done :)
            }
        }

        const char c{sData[index++]};
        if ((c & BIT8) == 0)
        {
            continue; // ASCII: one byte, one code unit
        }

        if ((c & BIT7) == 0)
        {
            throw std::range_error("UTF-8 string character can never start with 10xxxxxx");
        }
        else if ((c & BIT6) == 0) // 2 byte character, 0x80 to 0x7FF
        {
            if (index == sSize)
            {
                throw std::range_error("UTF-8 string is missing bytes in character");
            }

            const char c2{sData[index++]};
            if ((c2 & HI_2_BITS) != BIT8)
            {
                throw std::range_error("UTF-8 continuation byte is missing leading byte");
            }

            // can't require surrogates for 7FF, so we can bail
            --result;
        }
        else if ((c & BIT5) == 0) // 3 byte character, 0x800 to 0xFFFF
        {
            if (sSize - index < 2)
            {
                throw std::range_error("UTF-8 string is missing bytes in character");
            }

            const char c2{sData[index++]};
            const char c3{sData[index++]};
            // Check each byte on its own: OR-ing the bytes together first would let
            // a 00xxxxxx byte hide behind a genuine 10xxxxxx continuation byte.
            if ((c2 & HI_2_BITS) != BIT8 || (c3 & HI_2_BITS) != BIT8)
            {
                throw std::range_error("UTF-8 continuation byte is missing leading byte");
            }

            result -= 2; // three bytes become one code unit
        }
        else if ((c & BIT4) == 0) // 4 byte character, 0x10000 to 0x10FFFF
        {
            if (sSize - index < 3)
            {
                throw std::range_error("UTF-8 string is missing bytes in character");
            }

            const char c2{sData[index++]};
            const char c3{sData[index++]};
            const char c4{sData[index++]};
            // Checked individually for the same reason as the 3 byte case.
            if ((c2 & HI_2_BITS) != BIT8 || (c3 & HI_2_BITS) != BIT8 || (c4 & HI_2_BITS) != BIT8)
            {
                throw std::range_error("UTF-8 continuation byte is missing leading byte");
            }

            // Four bytes become a surrogate pair (two code units) when the value
            // is at least 0x10000, and a single code unit otherwise.
            const uint32_t codePoint = ((c & LOW_3BITS) << 18) | ((c2 & LOW_6BITS) << 12) | ((c3 & LOW_6BITS) << 6) | (c4 & LOW_6BITS);
            result -= (3 - (codePoint >= SURROGATE_PAIR_START));
        }
        else
        {
            throw std::range_error("UTF-8 string has invalid Unicode code point");
        }
    }

    return result;
 }

 // Converts the UTF-8 bytes in `s` to UTF-16.
 //
 // sse2_count_utf8_to_utf16 runs first: it validates the input (throwing
 // std::range_error on failure) and gives the exact output size, so the decoding
 // loop below performs no validation or bounds checks of its own. Runs of ASCII
 // are widened eight bytes at a time with SSE2 whenever the read position is
 // 8-byte aligned.
 __declspec(noinline) utf16string sse2_convert_to_utf16(const std::string &s)
 {
    utf16string dest(sse2_count_utf8_to_utf16(s), L'\0');
    utf16char * const out = &dest[0];
    const char * const src = s.data();
    const size_t srcSize = s.size();
    size_t outPos = 0;
    size_t srcPos = 0;

    while (srcPos < srcSize)
    {
        if ((reinterpret_cast<uintptr_t>(src + srcPos) & 7) == 0)
        {   // aligned: widen whole 8-byte blocks for as long as they are pure ASCII
            const __m128i zero = _mm_setzero_si128();
            for (size_t blocksLeft = (srcSize - srcPos) / 8; blocksLeft != 0; --blocksLeft)
            {
                const __m128i narrow = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src + srcPos));
                if (_mm_movemask_epi8(narrow) != 0)
                {   // some byte has its high bit set; let the scalar code handle it
                    break;
                }

                // Interleave with zero bytes to turn 8 chars into 8 16-bit units.
                const __m128i wide = _mm_unpacklo_epi8(narrow, zero);
                _mm_storeu_si128(reinterpret_cast<__m128i *>(out + outPos), wide);
                srcPos += 8;
                outPos += 8;
            }

            if (srcPos == srcSize)
            {
                break; // everything consumed
            }
        }

        const char lead{src[srcPos++]};

        if ((lead & BIT8) == 0) // 1 byte character
        {
            out[outPos++] = lead;
            continue;
        }

        if ((lead & BIT6) == 0) // 2 byte character, 0x80 to 0x7FF
        {
            const char b2{src[srcPos++]};
            out[outPos++] = static_cast<utf16char>(((lead & LOW_5BITS) << 6) | (b2 & LOW_6BITS));
            continue;
        }

        if ((lead & BIT5) == 0) // 3 byte character, 0x800 to 0xFFFF
        {
            const char b2{src[srcPos++]};
            const char b3{src[srcPos++]};
            out[outPos++] = static_cast<utf16char>(((lead & LOW_4BITS) << 12) | ((b2 & LOW_6BITS) << 6) | (b3 & LOW_6BITS));
            continue;
        }

        if ((lead & BIT4) != 0)
        {
            continue; // 11111xxx lead byte: nothing is written for it
        }

        // 4 byte character, 0x10000 to 0x10FFFF
        const char b2{src[srcPos++]};
        const char b3{src[srcPos++]};
        const char b4{src[srcPos++]};
        uint32_t codePoint = ((lead & LOW_3BITS) << 18) | ((b2 & LOW_6BITS) << 12) | ((b3 & LOW_6BITS) << 6) | (b4 & LOW_6BITS);
        if (codePoint < SURROGATE_PAIR_START)
        {
            // Values below 0x10000 are stored as a single code unit holding the
            // value itself (surrogate-range values are written through unchanged).
            out[outPos++] = static_cast<utf16char>(codePoint);
        }
        else
        {
            // Surrogate pair: subtract 0x10000, then the top ten bits go into the
            // high surrogate (0xD800 base) and the low ten bits into the low
            // surrogate (0xDC00 base).
            codePoint -= SURROGATE_PAIR_START;
            out[outPos++] = static_cast<utf16char>((codePoint >> 10) | H_SURROGATE_START);
            out[outPos++] = static_cast<utf16char>((codePoint & 0x3FF) | L_SURROGATE_START);
        }
    }

    return dest;
 }

 // Validates the UTF-16 code units in `w` and returns how many UTF-8 bytes the
 // converted text occupies.
 //
 // The count starts at w.size() (correct for pure ASCII) and grows for every
 // non-ASCII character. Runs of ASCII are skipped eight code units at a time
 // with SSE2 whenever the read position is 16-byte aligned.
 //
 // Throws std::range_error when a high surrogate is the last code unit or is not
 // followed by a low surrogate. A low surrogate that is not preceded by a high
 // surrogate is counted as a three-byte character.
 inline size_t sse2_count_to_utf8(const utf16string &w)
 {
    const utf16char * const wData = &w[0];
    const size_t wSize = w.size();
    size_t destSize{wSize};
    for (size_t index = 0; index < wSize;)
    {
        if ((reinterpret_cast<uintptr_t>(wData + index) & 15) == 0)
        {   // 128 bit aligned, try SIMD
            const utf16char * const basis = wData + index;
            const size_t maxLoop = (wSize - index) / 8;
            size_t basisOffset = 0;
            for (; basisOffset < maxLoop; ++basisOffset)
            {
                // SSE2 only has a signed 16-bit compare. Adding 0x8000 - 0x80 maps the
                // ASCII range 0x0000..0x007F onto 0x7F80..0x7FFF, the top of the signed
                // range; every other value wraps to something smaller than the offset.
                // The offset must be 0x7F80: with 0x7F7F the unit 0x0080 would land on
                // 0x7FFF and be mistaken for ASCII.
                const __m128i asciiShiftOffset = _mm_set1_epi16(0x8000u - 0x0080u);
                const __m128i input = _mm_load_si128(reinterpret_cast<const __m128i *>(basis + basisOffset * 8));
                const __m128i shiftedToTop = _mm_add_epi16(input, asciiShiftOffset);
                const __m128i compareResults = _mm_cmplt_epi16(shiftedToTop, asciiShiftOffset);
                if (_mm_movemask_epi8(compareResults))
                {   // found non ASCII, break to scalar loop
                    break;
                }
            }

            index += 8 * basisOffset;

            if (index == wSize)
            {
                break; // we're done :)
            }
        }

        const uint16_t ch{wData[index++]};
        if (ch > 0x7Fu) // not a single byte character; needs extra bytes
        {
            if (ch <= 0x7FFu) // 2 bytes needed (11 bits used)
            {
                ++destSize;
            }
            else if (ch >= H_SURROGATE_START && ch <= H_SURROGATE_END) // Check for high surrogate.
            {
                if (index == wSize)
                {
                    throw std::range_error("UTF-16 string is missing low surrogate");
                }

                const auto lowSurrogate = wData[index++];
                if (lowSurrogate < L_SURROGATE_START || lowSurrogate > L_SURROGATE_END)
                {
                    throw std::range_error("UTF-16 string has invalid low surrogate");
                }

                destSize += 2; // 4 bytes need using 21 bits (two code units were consumed)
            }
            else // 3 bytes needed (16 bits used)
            {
                destSize += 2;
            }
        }
    }

    return destSize;
 }

 // Converts the UTF-16 code units in `w` to UTF-8.
 //
 // sse2_count_to_utf8 runs first: it validates the input (throwing
 // std::range_error on failure) and gives the exact output size, so the encoding
 // loop below performs no validation or bounds checks of its own. The SIMD ASCII
 // test here must classify blocks exactly as the one in sse2_count_to_utf8 does,
 // otherwise the two passes disagree about the output size.
 __declspec(noinline) std::string sse2_convert_to_utf8(const utf16string &w)
 {
    const utf16char * const wData = &w[0];
    const size_t wSize = w.size();
    std::string dest(sse2_count_to_utf8(w), '\0');
    char * const destData = &dest[0];
    size_t destIndex{};
    for (size_t index = 0; index < wSize;)
    {
        if ((reinterpret_cast<uintptr_t>(wData + index) & 15) == 0)
        {   // 128 bit aligned, try SIMD
            const utf16char * const basis = wData + index;
            char * const destBasis = destData + destIndex;
            const size_t maxLoop = (wSize - index) / 8;
            size_t basisOffset = 0;
            for (; basisOffset < maxLoop; ++basisOffset)
            {
                // Same signed-compare trick as in sse2_count_to_utf8: after adding
                // 0x8000 - 0x80 only ASCII units are >= the offset. Using 0x7F7F here
                // would let the unit 0x0080 through and pack it into a raw 0x80 byte.
                const __m128i asciiShiftOffset = _mm_set1_epi16(0x8000u - 0x0080u);
                const __m128i input = _mm_load_si128(reinterpret_cast<const __m128i *>(basis + basisOffset * 8));
                const __m128i shiftedToTop = _mm_add_epi16(input, asciiShiftOffset);
                const __m128i compareResults = _mm_cmplt_epi16(shiftedToTop, asciiShiftOffset);
                if (_mm_movemask_epi8(compareResults))
                {   // found non ASCII, break to scalar loop
                    break;
                }

                // Narrow the 8 ASCII units to 8 bytes and store the low 64 bits.
                const __m128i result = _mm_packus_epi16(input, _mm_setzero_si128());
                _mm_storel_epi64(reinterpret_cast<__m128i *>(destBasis + basisOffset * 8), result);
            }

            index += 8 * basisOffset;
            destIndex += 8 * basisOffset;

            if (index == wSize)
            {
                break; // we're done :)
            }
        }

        const uint16_t ch{wData[index++]};
        if (ch <= 0x7Fu) // single byte character
        {
            destData[destIndex++] = static_cast<char>(ch);
        }
        else if (ch <= 0x7FFu) // 2 bytes needed (11 bits used)
        {
            destData[destIndex++] = char((ch >> 6) | 0xC0);               // leading 5 bits
            destData[destIndex++] = char((ch & LOW_6BITS) | BIT8);        // trailing 6 bits
        }
        else if (ch >= H_SURROGATE_START && ch <= H_SURROGATE_END) // Check for high surrogate.
        {
            const auto highSurrogate = ch;
            const auto lowSurrogate = wData[index++];
            // To get from surrogate pair to Unicode code point:
            // - subtract 0xD800 from high surrogate, this forms top ten bits
            // - subtract 0xDC00 from low surrogate, this forms low ten bits
            // - add 0x10000
            // Leaves a code point in U+10000 to U+10FFFF range.
            uint32_t codePoint = (((highSurrogate - H_SURROGATE_START) << 10)
                | (lowSurrogate - L_SURROGATE_START)) + SURROGATE_PAIR_START;

            // 4 bytes need using 21 bits
            destData[destIndex++] = char((codePoint >> 18) | 0xF0);                 // leading 3 bits
            destData[destIndex++] = char(((codePoint >> 12) & LOW_6BITS) | BIT8);   // next 6 bits
            destData[destIndex++] = char(((codePoint >> 6) & LOW_6BITS) | BIT8);    // next 6 bits
            destData[destIndex++] = char((codePoint & LOW_6BITS) | BIT8);           // trailing 6 bits
        }
        else // 3 bytes needed (16 bits used)
        {
            destData[destIndex++] = char((ch >> 12) | 0xE0);              // leading 4 bits
            destData[destIndex++] = char(((ch >> 6) & LOW_6BITS) | BIT8); // middle 6 bits
            destData[destIndex++] = char((ch & LOW_6BITS) | BIT8);        // trailing 6 bits
        }
    }

    return dest;
 }


 inline size_t sse4_count_to_utf8(const utf16string &w)
 {
 	const utf16char * const wData = &w[0];
 	const size_t wSize = w.size();
 	size_t destSize{ wSize };
 	for (size_t index = 0; index < wSize;)
 	{
 		if ((reinterpret_cast<uintptr_t>(wData + index) & 15) == 0)
 		{   // 128 bit aligned, try SIMD
 			const utf16char * const basis = wData + index;
 			const size_t maxLoop = (wSize - index) / 8;
 			size_t basisOffset = 0;
 			for (; basisOffset < maxLoop; ++basisOffset)
 			{
 				const __m128i input = _mm_load_si128(reinterpret_cast<const __m128i *>(basis + basisOffset * 8));
 				if (!_mm_test_all_zeros(input, _mm_set1_epi16(0xFF80u)))
 				{   // found non ASCII, break to scalar loop
 					break;
 				}
 			}

 			index += 8 * basisOffset;

 			if (index == wSize)
 			{
 				break; // we're done :)
 			}
 		}

 		const uint16_t ch{ wData[index++] };
 		if (ch > 0x7Fu) // single byte character
 		{
 			if (ch <= 0x7FFu) // 2 bytes needed (11 bits used)
 			{
 				++destSize;
 			}
 			else if (ch >= H_SURROGATE_START && ch <= H_SURROGATE_END) // Check for high surrogate.
 			{
 				if (index == wSize)
 				{
 					throw std::range_error("UTF-16 string is missing low surrogate");
 				}

 				const auto lowSurrogate = wData[index++];
 				if (lowSurrogate < L_SURROGATE_START || lowSurrogate > L_SURROGATE_END)
 				{
 					throw std::range_error("UTF-16 string has invalid low surrogate");
 				}

 				destSize += 2; // 4 bytes need using 21 bits
 			}
 			else // 3 bytes needed (16 bits used)
 			{
 				destSize += 2;
 			}
 		}
 	}

 	return destSize;
 }

 // Converts the UTF-16 code units in `w` to UTF-8.
 //
 // sse4_count_to_utf8 runs first: it validates the input (throwing
 // std::range_error on failure) and gives the exact output size, so the encoding
 // loop below performs no validation or bounds checks of its own. Runs of ASCII
 // are narrowed eight code units at a time whenever the read position is
 // 16-byte aligned.
 __declspec(noinline) std::string sse4_convert_to_utf8(const utf16string &w)
 {
    const utf16char * const units = &w[0];
    const size_t unitCount = w.size();
    std::string dest(sse4_count_to_utf8(w), '\0');
    char * const out = &dest[0];
    size_t outPos{};
    size_t pos = 0;

    while (pos < unitCount)
    {
        if ((reinterpret_cast<uintptr_t>(units + pos) & 15) == 0)
        {   // 128 bit aligned: narrow whole blocks for as long as they are pure ASCII
            for (size_t blocksLeft = (unitCount - pos) / 8; blocksLeft != 0; --blocksLeft)
            {
                const __m128i block = _mm_load_si128(reinterpret_cast<const __m128i *>(units + pos));
                // Any bit above the low seven in any lane means a non-ASCII unit.
                if (!_mm_test_all_zeros(block, _mm_set1_epi16(0xFF80u)))
                {
                    break; // let the scalar code look at this block
                }

                // Pack the 8 units down to 8 bytes and store the low 64 bits.
                const __m128i packed = _mm_packus_epi16(block, _mm_setzero_si128());
                _mm_storel_epi64(reinterpret_cast<__m128i *>(out + outPos), packed);
                pos += 8;
                outPos += 8;
            }

            if (pos == unitCount)
            {
                break; // everything consumed
            }
        }

        const uint16_t unit{ units[pos++] };

        if (unit <= 0x7Fu) // single byte character
        {
            out[outPos++] = static_cast<char>(unit);
            continue;
        }

        if (unit <= 0x7FFu) // 2 bytes needed (11 bits used)
        {
            out[outPos++] = char((unit >> 6) | 0xC0);          // leading 5 bits
            out[outPos++] = char((unit & LOW_6BITS) | BIT8);   // trailing 6 bits
            continue;
        }

        const bool isHighSurrogate = unit >= H_SURROGATE_START && unit <= H_SURROGATE_END;
        if (!isHighSurrogate) // 3 bytes needed (16 bits used)
        {
            out[outPos++] = char((unit >> 12) | 0xE0);                // leading 4 bits
            out[outPos++] = char(((unit >> 6) & LOW_6BITS) | BIT8);   // middle 6 bits
            out[outPos++] = char((unit & LOW_6BITS) | BIT8);          // trailing 6 bits
            continue;
        }

        // Surrogate pair. Rebuild the code point:
        // - subtract 0xD800 from the high surrogate to get the top ten bits
        // - subtract 0xDC00 from the low surrogate to get the low ten bits
        // - add 0x10000
        // The result lies in U+10000 to U+10FFFF.
        const auto highSurrogate = unit;
        const auto lowSurrogate = units[pos++];
        uint32_t codePoint = (((highSurrogate - H_SURROGATE_START) << 10)
            | (lowSurrogate - L_SURROGATE_START)) + SURROGATE_PAIR_START;

        // Four bytes carry 21 bits.
        out[outPos++] = char((codePoint >> 18) | 0xF0);                 // leading 3 bits
        out[outPos++] = char(((codePoint >> 12) & LOW_6BITS) | BIT8);   // next 6 bits
        out[outPos++] = char(((codePoint >> 6) & LOW_6BITS) | BIT8);    // next 6 bits
        out[outPos++] = char((codePoint & LOW_6BITS) | BIT8);           // trailing 6 bits
    }

    return dest;
 }

 #define NOMINMAX
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>

 // Baseline: converts the UTF-8 bytes in `s` to UTF-16 with the Win32
 // MultiByteToWideChar API (CP_UTF8, no flags).
 //
 // The API is called twice: once with a null output buffer to obtain the
 // required length, then again to fill a string of that length. The input is
 // taken by const reference so that a call does not copy the whole source
 // string; when this function is benchmarked, that copy would otherwise be
 // charged to the conversion. No error reporting is done: if the first call
 // returns 0 the result is an empty string.
 __declspec(noinline) utf16string windows_multi_byte_to_wide_char(const std::string& s)
 {
    if (s.empty())
    {
        // MultiByteToWideChar rejects a zero-length input; the answer is empty anyway.
        return utf16string();
    }

    const int srcSize = static_cast<int>(s.size());
    const int desiredSize =
        ::MultiByteToWideChar(CP_UTF8, 0, s.c_str(), srcSize, nullptr, 0);
    utf16string dest(desiredSize, L'\0');
    ::MultiByteToWideChar(CP_UTF8, 0, s.c_str(), srcSize, &dest[0], desiredSize);
    return dest;
 }

 // Baseline: converts the UTF-16 code units in `w` to UTF-8 with the Win32
 // WideCharToMultiByte API (CP_UTF8, no flags, no default character).
 //
 // The API is called twice: once with a null output buffer to obtain the
 // required byte count, then again to fill a string of that size.
 __declspec(noinline) std::string windows_wide_char_to_multi_byte(const utf16string& w)
 {
    const utf16char * const wide = w.c_str();
    const int wideLength = static_cast<int>(w.size());

    const int byteLength =
        ::WideCharToMultiByte(CP_UTF8, 0, wide, wideLength, nullptr, 0, nullptr, nullptr);

    std::string dest(byteLength, '\0');
    ::WideCharToMultiByte(CP_UTF8, 0, wide, wideLength, &dest[0], byteLength, nullptr, nullptr);
    return dest;
 }

 // Benchmark corpus: the same text as UTF-8 bytes and as UTF-16 code units.
 // Both are filled in by init_huckleberry before main() runs.
 static std::string huckleberryTxt;
 static utf16string huckleberryTxt16;

 // Loads the corpus during static initialization. The single instance below is
 // defined after the two strings it writes to.
 struct init_huckleberry {
    init_huckleberry() {
        // read_all comes from file.hpp; presumably it reads the whole file into
        // the string argument -- confirm against that header.
        read_all("huckleberry.txt", huckleberryTxt);
        huckleberryTxt16 = windows_multi_byte_to_wide_char(huckleberryTxt);
 #ifndef KEEP_ZERO_WIDTH_SPACES
        // Unless KEEP_ZERO_WIDTH_SPACES is defined, every 0xA0 code unit is replaced
        // with 0x20 and the UTF-8 copy is regenerated so both strings stay in sync.
        // NOTE(review): 0xA0 is the no-break space rather than a zero-width space,
        // so the macro name does not quite match what is replaced.
 		std::replace(huckleberryTxt16.begin(), huckleberryTxt16.end(), static_cast<wchar_t>(0xA0), static_cast<wchar_t>(0x20));
 		huckleberryTxt = windows_wide_char_to_multi_byte(huckleberryTxt16);
 #endif
    }
 };

 static init_huckleberry init_huckleberry_instance;

 // Computed once at startup: true only if the system clock reads exactly the
 // value of a default-constructed time_point. The compiler cannot know the
 // answer, so results passed to consume() cannot be optimized away.
 static const bool always_false = std::chrono::system_clock::time_point{} == std::chrono::system_clock::now();

 // Sink for a UTF-8 benchmark result; prints it only when always_false is set.
 static void consume(const std::string& b) {
    if (!always_false) {
        return;
    }
    printf("result: %s\n", b.c_str());
 }

 // Sink for a UTF-16 benchmark result; prints it only when always_false is set.
 static void consume(const utf16string& b) {
    if (!always_false) {
        return;
    }
    printf("result: %ls\n", b.c_str());
 }

 // Defines and registers a benchmark named bench_<func>.
 //
 //   input  - argument passed to func
 //   output - expected result of func(input)
 //   func   - conversion function under test
 //
 // Before timing, the generated function calls func(input) once and compares the
 // result with `output`. On a mismatch it prints both sizes and the index of the
 // first differing element, flushes stdout so the report is not lost, and calls
 // std::terminate(). The index is converted to size_t to match the %zu format
 // (std::distance yields a signed difference type). The timed loop then calls
 // func(input) repeatedly and hands each result to consume().
 #define MAKE_BENCHMARK(input, output, func) \
 static void bench_ ## func (benchmark::State& state) { \
    const auto actual = func(input); \
    if (actual != output) \
    {   \
        puts("Bad results in " #func); \
        printf("expected size: %zu actual size: %zu\n", output.size(), actual.size()); \
        const auto e = std::mismatch(actual.cbegin(), actual.cend(), output.cbegin(), output.cend()); \
        printf("at index: %zu\n", static_cast<size_t>(std::min(std::distance(actual.cbegin(), e.first), std::distance(output.cbegin(), e.second)))); \
        fflush(stdout); \
        std::terminate(); \
    } \
    while (state.KeepRunning()) { \
        consume(func(input)); \
    } \
 } \
 \
 BENCHMARK(bench_ ## func);


 // UTF-16 -> UTF-8: each function is given huckleberryTxt16 and must produce huckleberryTxt.
 MAKE_BENCHMARK(huckleberryTxt16, huckleberryTxt, std_wstring_convert_to_utf8);
 MAKE_BENCHMARK(huckleberryTxt16, huckleberryTxt, casablanca_to_utf8);
 MAKE_BENCHMARK(huckleberryTxt16, huckleberryTxt, sse2_convert_to_utf8);
 MAKE_BENCHMARK(huckleberryTxt16, huckleberryTxt, sse4_convert_to_utf8);
 MAKE_BENCHMARK(huckleberryTxt16, huckleberryTxt, windows_wide_char_to_multi_byte);

 // UTF-8 -> UTF-16: each function is given huckleberryTxt and must produce huckleberryTxt16.
 MAKE_BENCHMARK(huckleberryTxt, huckleberryTxt16, std_wstring_convert_to_utf16);
 MAKE_BENCHMARK(huckleberryTxt, huckleberryTxt16, casablanca_to_utf16);
 MAKE_BENCHMARK(huckleberryTxt, huckleberryTxt16, sse2_convert_to_utf16);
 MAKE_BENCHMARK(huckleberryTxt, huckleberryTxt16, windows_multi_byte_to_wide_char);

 // Expands to the program entry point supplied by the benchmark library.
 BENCHMARK_MAIN();
diff --git a/Results.txt b/Results.txt
 D:\build>.\codeset_conversion.exe
 07/25/18 14:13:25
 Running .\codeset_conversion.exe
 Run on (12 X 2904 MHz CPU s)
 CPU Caches:
  L1 Data 32K (x6)
  L1 Instruction 32K (x6)
  L2 Unified 262K (x6)
  L3 Unified 12582K (x1)
 -----------------------------------------------------------------------------
 Benchmark                                      Time           CPU Iterations
 -----------------------------------------------------------------------------
 bench_std_wstring_convert_to_utf8        2879317 ns    2913136 ns        236
 bench_casablanca_to_utf8                 1167771 ns    1171875 ns        560
 bench_sse2_convert_to_utf8                294400 ns     291561 ns       2358
 bench_sse4_convert_to_utf8                278647 ns     276215 ns       2489
 bench_windows_wide_char_to_multi_byte     455527 ns     449219 ns       1600
 bench_std_wstring_convert_to_utf16       3600100 ns    3599877 ns        204
 bench_casablanca_to_utf16                1339548 ns    1317771 ns        498
 bench_sse2_convert_to_utf16               432349 ns     429688 ns       1600
 bench_windows_multi_byte_to_wide_char     763676 ns     753348 ns       1120
	D:\build>.\codeset_conversion.exe
	07/25/18 14:13:25
	Running .\codeset_conversion.exe
	Run on (12 X 2904 MHz CPU s)
	CPU Caches:
	L1 Data 32K (x6)
	L1 Instruction 32K (x6)
	L2 Unified 262K (x6)
	L3 Unified 12582K (x1)
	-----------------------------------------------------------------------------
	Benchmark Time CPU Iterations
	-----------------------------------------------------------------------------
	bench_std_wstring_convert_to_utf8 2879317 ns 2913136 ns 236
	bench_casablanca_to_utf8 1167771 ns 1171875 ns 560
	bench_sse2_convert_to_utf8 294400 ns 291561 ns 2358
	bench_sse4_convert_to_utf8 278647 ns 276215 ns 2489
	bench_windows_wide_char_to_multi_byte 455527 ns 449219 ns 1600
	bench_std_wstring_convert_to_utf16 3600100 ns 3599877 ns 204
	bench_casablanca_to_utf16 1339548 ns 1317771 ns 498
	bench_sse2_convert_to_utf16 432349 ns 429688 ns 1600
	bench_windows_multi_byte_to_wide_char 763676 ns 753348 ns 1120