Skip to content

Instantly share code, notes, and snippets.

@komiga
Last active October 17, 2020 23:25
Show Gist options
  • Save komiga/5647785 to your computer and use it in GitHub Desktop.
Save komiga/5647785 to your computer and use it in GitHub Desktop.
constexpr UTF-8 decoder.
#include <cstdint>
#include <type_traits>
#include <stdexcept>
#include <iostream>
/// One UTF-8 code unit (a single encoded byte).
using Unit = std::uint8_t;
/// A decoded code point value.
using Point = std::uint32_t;

/// constexpr helpers for counting and decoding the code points stored in a
/// buffer of UTF-8 code units.
///
/// The length of every sequence is derived from its first unit alone.
/// Nothing here validates the units that follow (continuation markers,
/// overlong forms, surrogates, values above 0x10FFFF); a stray unit in the
/// range 0x80-0xBF is treated as a one-unit sequence.
///
/// Every function is a single return statement so it stays usable as a C++11
/// constant expression. The price is recursion whose depth grows linearly
/// with the amount of input walked.
namespace utf8 {

/// Number of units that follow a sequence's first unit, indexed by the value
/// of that first unit: 0x00-0xBF -> 0, 0xC0-0xDF -> 1, 0xE0-0xEF -> 2,
/// 0xF0-0xF7 -> 3, 0xF8-0xFB -> 4, 0xFC-0xFF -> 5.
constexpr static std::uint8_t const
s_utf8_trailing[256]{
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/// Per-length correction subtracted after the raw units of a sequence have
/// been shifted and summed, indexed by the number of trailing units. Each
/// entry is the sum of the marker bits the raw units contribute, e.g. for one
/// trailing unit (0xC0 << 6) + 0x80 == 0x3080.
///
/// Typed as Point so the subtraction in decode() is plain 32-bit modular
/// arithmetic instead of a wider computation that is narrowed on return.
constexpr static Point const
s_utf8_offsets[6]{
	0x00000000, 0x00003080, 0x000E2080,
	0x03C82080, 0xFA082080, 0x82082080
};

namespace impl {

/// True when `index` addresses a unit inside a buffer of `size` units.
constexpr bool
check_bounds(
	std::size_t const size,
	std::size_t const index
) noexcept {
	return index < size;
}

/// Total number of units (first unit included) in the sequence introduced by
/// `first`.
constexpr unsigned
required_first_whole(Unit const first) noexcept {
	return 1u + s_utf8_trailing[first];
}

/// Number of units used to encode `p`: 1 below 0x80, 2 below 0x800, 3 below
/// 0x10000, otherwise 4. Values above 0x10FFFF also report 1.
constexpr unsigned
required(Point const p) noexcept {
	return
		(p < 0x80 || p > 0x10FFFF) ? 1u
		: (p < 0x800) ? 2u
		: (p < 0x10000) ? 3u
		: 4u
	;
}

/// Counts the complete sequences in `data[idx .. size)` and adds them to
/// `length`.
///
/// Counting stops at the end of the buffer or at the first sequence whose
/// first unit announces more units than remain; such a truncated tail is not
/// counted.
constexpr std::size_t
count(
	Unit const* const data,
	std::size_t const size,
	std::size_t const length,
	std::size_t const idx
) noexcept {
	// The bounds test must come first: data[idx] may only be read while idx
	// is inside the buffer. Reading it unconditionally touched data[size] on
	// the final step, which is undefined at run time and prevents constant
	// evaluation over an array.
	return (check_bounds(size, idx)
			&& required_first_whole(data[idx]) <= size - idx)
		? count(
			data, size, length + 1u, idx + required_first_whole(data[idx])
		)
		: length
	;
}

/// Folds the unit at `idx` and the `trailing` units after it into `point`,
/// shifting the running value left by six bits between units.
///
/// The result still contains the marker bits of every unit; decode() removes
/// them.
///
/// @throws std::logic_error if a unit to be read lies outside the buffer.
constexpr Point
decode_compose_cascade(
	Unit const* const data,
	std::size_t const size,
	std::size_t const trailing,
	Point const point,
	std::size_t const idx
) {
	return check_bounds(size, idx)
		? (trailing > 0u
			? decode_compose_cascade(
				data, size, trailing - 1u, (point + data[idx]) << 6u, idx + 1u
			)
			: point + data[idx]
		)
		: throw std::logic_error("out of bounds")
	;
}

/// Decodes the sequence whose first unit is `data[idx]`.
///
/// @throws std::logic_error if `idx`, or any unit of the sequence, lies
///         outside the buffer.
constexpr Point
decode(
	Unit const* const data,
	std::size_t const size,
	std::size_t const idx
) {
	// Guard here as well: the trailing-count lookup below reads data[idx]
	// before decode_compose_cascade gets a chance to check it.
	return check_bounds(size, idx)
		? decode_compose_cascade(
			data, size, s_utf8_trailing[data[idx]], Point{0u}, idx
		)
		- s_utf8_offsets[s_utf8_trailing[data[idx]]]
		: throw std::logic_error("out of bounds")
	;
}

/// Walks the buffer sequence by sequence, starting at unit `current_unit`
/// (which is point number `current_point`), and decodes point number
/// `point_idx`.
///
/// @throws std::logic_error if the end of the buffer is reached first, or if
///         the requested sequence is truncated.
constexpr Point
point_at(
	Unit const* const data,
	std::size_t const size,
	std::size_t const point_idx,
	std::size_t const current_point,
	std::size_t const current_unit
) {
	return check_bounds(size, current_unit)
		? (point_idx == current_point
			? decode(data, size, current_unit)
			: point_at(
				data, size, point_idx, current_point + 1u,
				current_unit + required_first_whole(data[current_unit])
			)
		)
		: throw std::logic_error("out of bounds")
	;
}

} // namespace impl

/// Number of complete sequences in the array `data`.
///
/// All N elements are examined, so for an array initialised from a string
/// literal the terminating NUL is counted as a point too.
template<std::size_t N>
constexpr std::size_t
count(Unit const (&data)[N]) {
	return impl::count(
		data, N, 0u, 0u
	);
}

/// Number of complete sequences in the first `size` units of `data`.
/// `data` is not read when `size` is zero.
constexpr std::size_t
count(
	Unit const* const data,
	std::size_t const size
) {
	return impl::count(
		data, size, 0u, 0u
	);
}

/// Decodes point number `point_idx` (zero-based) of the array `data`.
///
/// @throws std::logic_error if the array holds no such point or the
///         requested sequence is truncated.
template<std::size_t N>
constexpr Point
point_at(
	Unit const (&data)[N],
	std::size_t const point_idx
) {
	return impl::point_at(
		data, N, point_idx, 0u, 0u
	);
}

/// Decodes point number `point_idx` (zero-based) from the first `size` units
/// of `data`.
///
/// @throws std::logic_error if the buffer holds no such point or the
///         requested sequence is truncated.
constexpr Point
point_at(
	Unit const* const data,
	std::size_t const size,
	std::size_t const point_idx
) {
	return impl::point_at(
		data, size, point_idx, 0u, 0u
	);
}

} // namespace utf8
/// Accumulator step for ce_strlen: returns `seen` plus the number of elements
/// from `cursor` up to (not including) the first one equal to '\0'.
/// `cursor` must not be null.
template<typename T>
constexpr std::size_t
ce_strlen_from(T const* const cursor, std::size_t const seen) {
	return (*cursor == '\0')
		? seen
		: ce_strlen_from<T>(cursor + 1u, seen + 1u)
	;
}

/// constexpr string length: the number of elements before the first one that
/// compares equal to '\0'. A null pointer yields 0.
template<typename T>
constexpr std::size_t
ce_strlen(T const* const str) {
	return (str != nullptr)
		? ce_strlen_from<T>(str, 0u)
		: 0u
	;
}
// Built-in sample text: the units of the UTF-8 literal "はい" followed by the
// literal's terminating NUL.
static constexpr Unit s_str_kana_yes[] = u8"はい";
int main(int argc, char* argv[]) {
Unit const* str = s_str_kana_yes;
if (1 < argc) {
str = reinterpret_cast<Unit*>(argv[1]);
}
std::size_t size = ce_strlen(str);
std::size_t const count = utf8::count(str, size);
std::cout
<< "count: " << count
<< " points:" <<std ::endl
;
std::cout << std::hex;
for (std::size_t idx = 0u; idx < count; ++idx) {
std::cout << ' ' << utf8::point_at(str, size, idx);
}
std::cout << std::endl;
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment