Last active
October 17, 2020 23:25
-
-
Save komiga/5647785 to your computer and use it in GitHub Desktop.
constexpr UTF-8 decoder.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdint> | |
#include <type_traits> | |
#include <stdexcept> | |
#include <iostream> | |
// One UTF-8 code unit (a single byte) and one decoded code point.
using Unit = uint8_t;
using Point = uint32_t;

namespace utf8 {

// Number of trailing units that follow a lead unit, indexed by the lead
// unit's value:
//   0x00-0xBF -> 0    0xC0-0xDF -> 1    0xE0-0xEF -> 2
//   0xF0-0xF7 -> 3    0xF8-0xFB -> 4    0xFC-0xFF -> 5
// 0x80-0xBF map to 0, so a stray continuation unit is treated as a
// sequence of its own.
constexpr static uint8_t const
s_utf8_trailing[256]{
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

// Per-length correction subtracted by impl::decode(), indexed by the
// number of trailing units. Each entry is the sum of the marker bits that
// the shift-and-add cascade accumulates for that length, e.g. for one
// trailing unit: (0xC0 << 6) + 0x80 == 0x3080.
constexpr static std::size_t const
s_utf8_offsets[6]{
	0x00000000, 0x00003080, 0x000E2080,
	0x03C82080, 0xFA082080, 0x82082080
};

namespace impl {

// True when index addresses a unit inside a buffer of size units.
constexpr bool
check_bounds(
	std::size_t const size,
	std::size_t const index
) noexcept {
	return index < size;
}

// Total number of units (lead unit included) in the sequence that starts
// with the lead unit first, according to s_utf8_trailing.
constexpr unsigned
required_first_whole(Unit const first) noexcept {
	return 1u + s_utf8_trailing[first];
}

// Number of units needed to encode the code point p. Values above
// 0x10FFFF yield 1, the same as values below 0x80.
constexpr unsigned
required(Point const p) noexcept {
	return
		(p < 0x80 || p > 0x10FFFF) ? 1u
		: (p < 0x800) ? 2u
		: (p < 0x10000) ? 3u
		: 4u
	;
}

// Counts the sequences in data[idx, size) that fit entirely inside the
// buffer, starting from the running total length. Counting stops at the
// first sequence that would extend past size; that sequence is not
// counted.
//
// data[idx] is only read after idx has been bounds-checked, so an empty
// or non-zero-terminated buffer is never read past its end, and the
// remaining-space test uses size - idx so it cannot overflow.
constexpr std::size_t
count(
	Unit const* const data,
	std::size_t const size,
	std::size_t const length,
	std::size_t const idx
) noexcept {
	return (check_bounds(size, idx)
		&& size - idx >= required_first_whole(data[idx]))
		? count(
			data, size, length + 1u, idx + required_first_whole(data[idx])
		)
		: length
	;
}

// Shift-and-add accumulation of one sequence: adds data[idx] to point and,
// while trailing units remain, shifts the sum left by 6 bits and recurses
// on the next unit. The marker bits are left in the result; decode()
// removes them. Continuation units are not validated.
//
// Throws std::logic_error("out of bounds") if a unit of the sequence lies
// at or beyond size.
constexpr Point
decode_compose_cascade(
	Unit const* const data,
	std::size_t const size,
	std::size_t const trailing,
	Point const point,
	std::size_t const idx
) {
	return check_bounds(size, idx)
		? (0 < trailing
			? decode_compose_cascade(
				data, size, trailing - 1u, (point + data[idx]) << 6u, idx + 1u
			)
			: point + data[idx]
		)
		: throw std::logic_error("out of bounds")
	;
}

// Decodes the sequence whose lead unit is data[idx]. The caller must
// guarantee idx < size: the lead unit is read here without a bounds check.
//
// Throws std::logic_error (from decode_compose_cascade) if the sequence
// is truncated by the end of the buffer.
constexpr Point
decode(
	Unit const* const data,
	std::size_t const size,
	std::size_t const idx
) {
	return
		decode_compose_cascade(
			data, size, s_utf8_trailing[data[idx]], Point{0u}, idx
		)
		// Subtract in Point width: the arithmetic is intentionally modular
		// and the result no longer depends on an implicit narrowing from
		// std::size_t.
		- static_cast<Point>(s_utf8_offsets[s_utf8_trailing[data[idx]]])
	;
}

// Walks the buffer sequence by sequence from unit current_unit (which is
// the start of code point number current_point) and decodes the code
// point whose index is point_idx.
//
// Throws std::logic_error("out of bounds") if the buffer ends before
// point_idx is reached or in the middle of the requested sequence.
constexpr Point
point_at(
	Unit const* const data,
	std::size_t const size,
	std::size_t const point_idx,
	std::size_t const current_point,
	std::size_t const current_unit
) {
	return check_bounds(size, current_unit)
		? (point_idx == current_point
			? decode(data, size, current_unit)
			: point_at(
				data, size, point_idx, current_point + 1u,
				current_unit + required_first_whole(data[current_unit])
			)
		)
		: throw std::logic_error("out of bounds")
	;
}

} // namespace impl

// Number of complete sequences in the array data. N is the full array
// extent, so for an array initialized from a string literal the
// terminating zero is counted as a code point as well.
template<std::size_t N>
constexpr std::size_t
count(Unit const (&data)[N]) {
	return impl::count(
		data, N, 0u, 0u
	);
}

// Number of complete sequences in the first size units of data. A
// sequence truncated by the end of the buffer is not counted. data is not
// read when size is 0.
constexpr std::size_t
count(
	Unit const* const data,
	std::size_t const size
) {
	return impl::count(
		data, size, 0u, 0u
	);
}

// Code point number point_idx (zero-based) of the array data.
// Throws std::logic_error if point_idx is past the last code point or the
// requested sequence is truncated.
template<std::size_t N>
constexpr Point
point_at(
	Unit const (&data)[N],
	std::size_t const point_idx
) {
	return impl::point_at(
		data, N, point_idx, 0u, 0u
	);
}

// Code point number point_idx (zero-based) within the first size units of
// data. Each call scans from the start of the buffer.
// Throws std::logic_error if point_idx is past the last code point or the
// requested sequence is truncated.
constexpr Point
point_at(
	Unit const* const data,
	std::size_t const size,
	std::size_t const point_idx
) {
	return impl::point_at(
		data, size, point_idx, 0u, 0u
	);
}

} // namespace utf8
// Length of a zero-terminated sequence of T, usable in constant
// expressions. Returns the number of elements that precede the first
// element equal to '\0'; a null pointer yields 0.
template<typename T>
constexpr std::size_t
ce_strlen(T const* const str) {
	return (str != nullptr && *str != '\0')
		? ce_strlen<T>(str + 1u) + 1u
		: 0u
	;
}
// Built-in sample text used by main() when no argument is supplied:
// the hiragana "はい" as UTF-8 code units, followed by the terminating zero.
static constexpr Unit s_str_kana_yes[] = u8"はい";
int main(int argc, char* argv[]) { | |
Unit const* str = s_str_kana_yes; | |
if (1 < argc) { | |
str = reinterpret_cast<Unit*>(argv[1]); | |
} | |
std::size_t size = ce_strlen(str); | |
std::size_t const count = utf8::count(str, size); | |
std::cout | |
<< "count: " << count | |
<< " points:" <<std ::endl | |
; | |
std::cout << std::hex; | |
for (std::size_t idx = 0u; idx < count; ++idx) { | |
std::cout << ' ' << utf8::point_at(str, size, idx); | |
} | |
std::cout << std::endl; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment