Created
September 3, 2023 04:05
-
-
Save ttsugriy/be4597c93f0c3b89cc91708cae2e7111 to your computer and use it in GitHub Desktop.
utf8 length from utf32 benchmark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstddef> | |
#include <cstdint> | |
#include <cuchar> | |
// Returns the number of UTF-8 bytes needed to encode the `len` UTF-32 code
// units starting at `buf` (branching implementation).
//
// Each value is classified purely by magnitude:
//   <= 0x7F   -> 1 byte
//   <= 0x7FF  -> 2 bytes
//   <= 0xFFFF -> 3 bytes
//   otherwise -> 4 bytes
// No validation is performed: surrogate values and values above 0x10FFFF are
// counted like any other value in their range. A byte-order mark gets no
// special treatment. `buf` is never dereferenced when `len` is 0.
size_t utf8_length_from_utf32(const char32_t* buf, size_t len) {
  size_t counter{0};
  for (size_t i = 0; i < len; i++) {
    // Compare the char32_t directly; reading the buffer through a
    // reinterpret_cast'ed uint32_t pointer would break the aliasing rules.
    const char32_t code_point = buf[i];
    if (code_point <= 0x7Fu) {
      /** ASCII **/
      counter++;
    } else if (code_point <= 0x7FFu) {
      /** two-byte **/
      counter += 2;
    } else if (code_point <= 0xFFFFu) {
      /** three-byte **/
      counter += 3;
    } else {
      /** four-bytes **/
      counter += 4;
    }
  }
  return counter;
}
// Returns the number of UTF-8 bytes needed to encode the `len` UTF-32 code
// units starting at `buf` (branch-free implementation).
//
// Every value contributes one byte, plus one more byte for each threshold it
// exceeds (0x7F, 0x7FF, 0xFFFF), so the loop body contains no data-dependent
// branches. No validation is performed: surrogate values and values above
// 0x10FFFF are counted like any other value in their range. A byte-order mark
// gets no special treatment. `buf` is never dereferenced when `len` is 0.
size_t utf8_length_from_utf32v(const char32_t* buf, size_t len) {
  size_t counter{0};
  for (size_t i = 0; i < len; i++) {
    // Compare the char32_t directly; reading the buffer through a
    // reinterpret_cast'ed uint32_t pointer would break the aliasing rules.
    const char32_t code_point = buf[i];
    ++counter;                                              /** ASCII **/
    counter += static_cast<size_t>(code_point > 0x7Fu);     /** two-byte **/
    counter += static_cast<size_t>(code_point > 0x7FFu);    /** three-byte **/
    counter += static_cast<size_t>(code_point > 0xFFFFu);   /** four-bytes **/
  }
  return counter;
}
// Benchmark input: a pure-ASCII UTF-32 sample, so every element encodes to a
// single UTF-8 byte. The array holds 128 characters plus the terminating NUL
// (129 elements in total).
const char32_t text[] = U"eckwd4c7cu47r2wfeckwd4c7cu47r2wfeckwd4c7cu47r2wfeckwd4c7cu47r2wfeckwd4c7cu47r2wfeckwd4c7cu47r2wfeckwd4c7cu47r2wfeckwd4c7cu47r2wf";
// Alternative input mixing one-byte and three-byte characters. It is shorter
// than the ASCII sample, so any element count used with it must be recomputed.
// const char32_t text[] = U"MajiでKoiする5秒前MajiでKoiする5秒前MajiでKoiする5秒前MajiでKoiする5秒前MajiでKoiする5秒前MajiでKoiする5秒前MajiでKoiする5秒前MajiでKoiする5秒前MajiでKoiする5秒前";
static void BH_length(benchmark::State& state) { | |
const auto text = (const char32_t *)state.range(0); | |
for (auto _ : state) { | |
benchmark::DoNotOptimize(utf8_length_from_utf32(text, 129)); | |
} | |
} | |
BENCHMARK(BH_length)->Arg((int64_t)text); | |
static void BH_lengthVec(benchmark::State& state) { | |
const auto text = (const char32_t *)state.range(0); | |
for (auto _ : state) { | |
benchmark::DoNotOptimize(utf8_length_from_utf32v(text, 129)); | |
} | |
} | |
BENCHMARK(BH_lengthVec)->Arg((int64_t)text); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment