Last active
September 21, 2022 07:57
-
-
Save sayurin/59977a7f2f9241e3c22c1b48c25d8fb6 to your computer and use it in GitHub Desktop.
HFS+に出現しない文字、出現する文字
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <charconv> | |
#include <fstream> | |
#include <iostream> | |
#include <map> | |
#include <regex> | |
#include <set> | |
#include <string> | |
#include <string_view> | |
#include <cassert> | |
using namespace std::literals; | |
// Parses the text covered by a regex sub-match as a hexadecimal integer.
//
// sm: sub-match expected to cover hexadecimal digits only.
// Returns the parsed value, or 0 when nothing could be parsed (the sub-match
// is unmatched or empty, does not start with a hex digit, or the value does
// not fit in an int).
static auto hex(std::ssub_match const& sm) {
    int code = 0;  // std::from_chars leaves the output untouched on failure
    if (auto const length = sm.length(); length > 0) {
        // Derive the end pointer from the length rather than dereferencing
        // sm.second, which may be the subject string's past-the-end iterator.
        auto const* const first = &*sm.first;
        std::from_chars(first, first + length, code, 16);
    }
    return code;
}
// Renders a byte value (0x00..0xFF) as a regex hex escape of the form \xNN,
// using two lowercase hexadecimal digits with zero padding.
static auto hex(int code) {
    assert(0 <= code && code < 0x100);
    std::string escape = R"(\x00)";
    // A one-digit value is written into the last slot only, so the leading
    // '0' of the template stays in place as padding.
    auto const digits = code < 0x10 ? 1 : 2;
    char* const stop = escape.data() + escape.size();
    std::to_chars(stop - digits, stop, code, 16);
    return escape;
}
// Combines regex terms into a single alternative.
//
// A single term is returned unchanged; several terms are wrapped in a
// non-capturing group and separated by '|', e.g. {"a", "b"} -> "(?:a|b)".
// Note: an empty list yields "(?)" because the final character of the
// opening "(?:" is overwritten; callers are expected to pass at least one term.
static auto join(std::vector<std::string> const& terms) {
    if (terms.size() == 1)
        return terms.front();
    std::string alternation = "(?:";
    for (auto it = terms.begin(); it != terms.end(); ++it) {
        alternation.append(*it);
        alternation.push_back('|');
    }
    // Turn the trailing separator into the closing parenthesis.
    alternation.back() = ')';
    return alternation;
}
static auto classRange(int start, int last) { | |
assert(0 <= start && start <= last); | |
auto classRange = hex(start); | |
if (start < last) { | |
if (start + 1 < last) | |
classRange += '-'; | |
classRange += hex(last); | |
} | |
return classRange; | |
} | |
static auto pattern(std::set<int> const& set) { | |
assert(!empty(set)); | |
std::vector<std::string> atoms; | |
if (size(set) == 1) | |
atoms.emplace_back(hex(*begin(set))); | |
else { | |
std::string characterClass{ '[' }; | |
int start = -1, last = -1; | |
for (auto code : set) | |
if (start == -1) | |
start = last = code; | |
else if (code == last + 1) | |
last = code; | |
else { | |
characterClass += classRange(start, last); | |
start = last = code; | |
} | |
characterClass += classRange(start, last); | |
characterClass += ']'; | |
atoms.emplace_back(characterClass); | |
} | |
return atoms; | |
} | |
// Builds regex atoms for a byte trie level.
//
// `map` associates a lead byte with what may follow it: either a set of final
// bytes or a nested map of the same shape (the call recurses through the
// pattern() overloads). Lead bytes whose follow-up pattern is textually
// identical are merged into one atom, so the result holds one
// "<lead bytes><follow-up>" string per distinct follow-up, ordered by the
// smallest lead byte of each group.
template<class Map>
static auto pattern(Map const& map) {
    // follow-up pattern -> every lead byte that shares it
    std::map<std::string, std::set<int>> group;
    // Bind by const reference: the mapped values can be deeply nested
    // containers, and copying them on every iteration is pure overhead.
    for (auto const& [lead, followers] : map)
        group[join(pattern(followers))].emplace(lead);

    // smallest lead byte of the group -> complete atom, to get a stable order
    std::map<int, std::string> sort;
    for (auto const& [suffix, leads] : group)
        sort.emplace(*leads.begin(), join(pattern(leads)) + suffix);

    std::vector<std::string> result;
    result.reserve(sort.size());
    for (auto const& [_, atom] : sort)
        result.emplace_back(atom);
    return result;
}
// Appends the regex atoms generated from `map` to `atoms` and returns `atoms`
// so that calls can be chained. An empty `map` contributes nothing (and is
// never passed to pattern()). Only participates in overload resolution when
// the left operand is a std::vector<std::string>.
template<class Vector, class Map, class = std::enable_if_t<std::is_same_v<std::remove_cvref_t<Vector>, std::vector<std::string>>>>
static inline auto& operator<<(Vector&& atoms, Map const& map) {
    if (map.empty())
        return atoms;
    auto const extra = pattern(map);
    atoms.insert(atoms.end(), extra.begin(), extra.end());
    return atoms;
}
static auto regex(std::set<int> const& codes) { | |
std::set<int> u1; | |
std::map<int, std::set<int>> u2; | |
std::map<int, std::map<int, std::set<int>>> u3; | |
std::map<int, std::map<int, std::map<int, std::set<int>>>> u4; | |
for (auto code : codes) { | |
assert(0 <= code && code <= 0x10'FFFF); | |
if (code < 0x80) | |
u1.emplace(code); | |
else if (code < 0x800) | |
u2[0xC0 | code >> 6].emplace(0x80 | code & 0x3F); | |
else if (code < 0x1'0000) | |
u3[0xE0 | code >> 12][0x80 | code >> 6 & 0x3F].emplace(0x80 | code & 0x3F); | |
else | |
u4[0xF0 | code >> 18][0x80 | code >> 12 & 0x3F][0x80 | code >> 6 & 0x3F].emplace(0x80 | code & 0x3F); | |
} | |
return join(std::vector<std::string>{} << u1 << u2 << u3 << u4); | |
} | |
int main() { | |
std::set<int> nfc, nfd; | |
{ | |
std::regex re{ R"(^([0-9A-F]+);(?:[^;]*;){4}[0-9A-F]+(?: ([0-9A-F]+))?;)", std::regex_constants::optimize }; | |
// https://www.unicode.org/Public/UCD/latest/ucd/ | |
std::ifstream ud{ "UnicodeData.txt" }; | |
for (std::string line; getline(ud, line);) | |
if (std::smatch m; std::regex_search(line, m, re)) | |
// https://developer.apple.com/library/archive/qa/qa1173/ | |
if (auto code = hex(m[1]); !(0x2000 <= code && code <= 0x2FFF || 0xF900 <= code && code <= 0xFAFF || 0x2F800 <= code && code <= 0x2FAFF)) { | |
nfc.emplace(code); | |
if (m[2].matched) | |
nfd.emplace(hex(m[2])); | |
} | |
} | |
std::cout << "NFC:"sv << std::endl << regex(nfc) << std::endl; | |
std::cout << "NFD:"sv << std::endl << regex(nfd) << std::endl; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
NFC: | |
(?:\xc3[\x80-\x85\x87-\x8f\x91-\x96\x99-\x9d\xa0-\xa5\xa7-\xaf\xb1-\xb6\xb9-\xbd\xbf] | |
|\xc4[\x80-\x8f\x92-\xa5\xa8-\xb0\xb4-\xb7\xb9-\xbe] | |
|\xc5[\x83-\x88\x8c-\x91\x94-\xa5\xa8-\xbe] | |
|\xc6[\xa0\xa1\xaf\xb0] | |
|\xc7[\x8d-\x9c\x9e-\xa3\xa6-\xb0\xb4\xb5\xb8-\xbf] | |
|\xc8[\x80-\x9b\x9e\x9f\xa6-\xb3] | |
|\xcd[\x80\x81\x83\x84\xb4\xbe] | |
|\xce[\x85-\x8a\x8c\x8e-\x90\xaa-\xb0] | |
|\xcf[\x8a-\x8e\x93\x94] | |
|\xd0[\x80\x81\x83\x87\x8c-\x8e\x99\xb9] | |
|\xd1[\x90\x91\x93\x97\x9c-\x9e\xb6\xb7] | |
|\xd3[\x81\x82\x90-\x93\x96\x97\x9a-\x9f\xa2-\xa7\xaa-\xb5\xb8\xb9] | |
|\xd8[\xa2-\xa6] | |
|\xdb[\x80\x82\x93] | |
|\xe0(?:\xa4[\xa9\xb1\xb4] | |
|\xa5[\x98-\x9f] | |
|\xa7[\x8b\x8c\x9c\x9d\x9f] | |
|\xa8[\xb3\xb6] | |
|\xa9[\x99-\x9b\x9e] | |
|\xad[\x88\x8b\x8c\x9c\x9d] | |
|\xae\x94 | |
|[\xaf\xb5][\x8a-\x8c] | |
|\xb1\x88 | |
|\xb3[\x80\x87\x88\x8a\x8b] | |
|\xb7[\x9a\x9c-\x9e] | |
|\xbd[\x83\x8d\x92\x97\x9c\xa9\xb3\xb5\xb6\xb8] | |
|\xbe[\x81\x93\x9d\xa2\xa7\xac\xb9]) | |
|\xe1(?:\x80\xa6 | |
|\xac[\x86\x88\x8a\x8c\x8e\x92\xbb\xbd] | |
|\xad[\x80\x81\x83] | |
|[\xb8\xb9][\x80-\xbf] | |
|\xba[\x80-\x99\x9b\xa0-\xbf] | |
|\xbb[\x80-\xb9] | |
|\xbc[\x80-\x95\x98-\x9d\xa0-\xbf] | |
|\xbd[\x80-\x85\x88-\x8d\x90-\x97\x99\x9b\x9d\x9f-\xbd] | |
|\xbe[\x80-\xb4\xb6-\xbc\xbe] | |
|\xbf[\x81-\x84\x86-\x93\x96-\x9b\x9d-\xaf\xb2-\xb4\xb6-\xbd]) | |
|\xe3(?:\x81[\x8c\x8e\x90\x92\x94\x96\x98\x9a\x9c\x9e\xa0\xa2\xa5\xa7\xa9\xb0\xb1\xb3\xb4\xb6\xb7\xb9\xba\xbc\xbd] | |
|\x82[\x94\x9e\xac\xae\xb0\xb2\xb4\xb6\xb8\xba\xbc\xbe] | |
|\x83[\x80\x82\x85\x87\x89\x90\x91\x93\x94\x96\x97\x99\x9a\x9c\x9d\xb4\xb7-\xba\xbe]) | |
|\xef(?:\xac[\x9d\x9f\xaa-\xb6\xb8-\xbc\xbe] | |
|\xad[\x80\x81\x83\x84\x86-\x8e]) | |
|\xf0(?:\x91(?:\x82[\x9a\x9c\xab] | |
|\x84[\xae\xaf] | |
|\x8d[\x8b\x8c] | |
|\x92[\xbb\xbc\xbe] | |
|\x96[\xba\xbb]) | |
|\x9d(?:\x85[\x9e-\xa4] | |
|\x86[\xbb-\xbf] | |
|\x87\x80))) | |
NFD: | |
(?:\xcc[\x80-\x84\x86-\x8c\x8f\x91\x93\x94\x9b\xa3-\xa8\xad\xae\xb0\xb1] | |
|\xcd[\x82\x85] | |
|\xd6[\xb4\xb7-\xb9\xbc\xbf] | |
|\xd7[\x81\x82] | |
|\xd9[\x93-\x95] | |
|\xe0(?:[\xa4\xa8]\xbc | |
|[\xa6\xac][\xbc\xbe] | |
|[\xa7\xaf\xb5]\x97 | |
|\xad[\x96\x97] | |
|[\xae\xb4]\xbe | |
|\xb1\x96 | |
|\xb3[\x82\x95\x96] | |
|\xb7[\x8a\x8f\x9f] | |
|\xbd[\xb2\xb4] | |
|\xbe[\x80\xb5\xb7]) | |
|\xe1(?:\x80\xae|\xac\xb5) | |
|\xe3\x82[\x99\x9a] | |
|\xf0(?:\x91(?:\x82\xba | |
|\x84\xa7 | |
|\x8c\xbe | |
|\x8d\x97 | |
|\x92[\xb0\xba\xbd] | |
|\x96\xaf) | |
|\x9d\x85[\xa5\xae-\xb2])) | |
NFC: | |
00C0-00C5 | |
00C7-00CF | |
00D1-00D6 | |
00D9-00DD | |
00E0-00E5 | |
00E7-00EF | |
00F1-00F6 | |
00F9-00FD | |
00FF-010F | |
0112-0125 | |
0128-0130 | |
0134-0137 | |
0139-013E | |
0143-0148 | |
014C-0151 | |
0154-0165 | |
0168-017E | |
01A0 | |
01A1 | |
01AF | |
01B0 | |
01CD-01DC | |
01DE-01E3 | |
01E6-01F0 | |
01F4 | |
01F5 | |
01F8-021B | |
021E | |
021F | |
0226-0233 | |
0340 | |
0341 | |
0343 | |
0344 | |
0374 | |
037E | |
0385-038A | |
038C | |
038E-0390 | |
03AA-03B0 | |
03CA-03CE | |
03D3 | |
03D4 | |
0400 | |
0401 | |
0403 | |
0407 | |
040C-040E | |
0419 | |
0439 | |
0450 | |
0451 | |
0453 | |
0457 | |
045C-045E | |
0476 | |
0477 | |
04C1 | |
04C2 | |
04D0-04D3 | |
04D6 | |
04D7 | |
04DA-04DF | |
04E2-04E7 | |
04EA-04F5 | |
04F8 | |
04F9 | |
0622-0626 | |
06C0 | |
06C2 | |
06D3 | |
0929 | |
0931 | |
0934 | |
0958-095F | |
09CB | |
09CC | |
09DC | |
09DD | |
09DF | |
0A33 | |
0A36 | |
0A59-0A5B | |
0A5E | |
0B48 | |
0B4B | |
0B4C | |
0B5C | |
0B5D | |
0B94 | |
0BCA-0BCC | |
0C48 | |
0CC0 | |
0CC7 | |
0CC8 | |
0CCA | |
0CCB | |
0D4A-0D4C | |
0DDA | |
0DDC-0DDE | |
0F43 | |
0F4D | |
0F52 | |
0F57 | |
0F5C | |
0F69 | |
0F73 | |
0F75 | |
0F76 | |
0F78 | |
0F81 | |
0F93 | |
0F9D | |
0FA2 | |
0FA7 | |
0FAC | |
0FB9 | |
1026 | |
1B06 | |
1B08 | |
1B0A | |
1B0C | |
1B0E | |
1B12 | |
1B3B | |
1B3D | |
1B40 | |
1B41 | |
1B43 | |
1E00-1E99 | |
1E9B | |
1EA0-1EF9 | |
1F00-1F15 | |
1F18-1F1D | |
1F20-1F45 | |
1F48-1F4D | |
1F50-1F57 | |
1F59 | |
1F5B | |
1F5D | |
1F5F-1F7D | |
1F80-1FB4 | |
1FB6-1FBC | |
1FBE | |
1FC1-1FC4 | |
1FC6-1FD3 | |
1FD6-1FDB | |
1FDD-1FEF | |
1FF2-1FF4 | |
1FF6-1FFD | |
304C | |
304E | |
3050 | |
3052 | |
3054 | |
3056 | |
3058 | |
305A | |
305C | |
305E | |
3060 | |
3062 | |
3065 | |
3067 | |
3069 | |
3070 | |
3071 | |
3073 | |
3074 | |
3076 | |
3077 | |
3079 | |
307A | |
307C | |
307D | |
3094 | |
309E | |
30AC | |
30AE | |
30B0 | |
30B2 | |
30B4 | |
30B6 | |
30B8 | |
30BA | |
30BC | |
30BE | |
30C0 | |
30C2 | |
30C5 | |
30C7 | |
30C9 | |
30D0 | |
30D1 | |
30D3 | |
30D4 | |
30D6 | |
30D7 | |
30D9 | |
30DA | |
30DC | |
30DD | |
30F4 | |
30F7-30FA | |
30FE | |
FB1D | |
FB1F | |
FB2A-FB36 | |
FB38-FB3C | |
FB3E | |
FB40 | |
FB41 | |
FB43 | |
FB44 | |
FB46-FB4E | |
1109A | |
1109C | |
110AB | |
1112E | |
1112F | |
1134B | |
1134C | |
114BB | |
114BC | |
114BE | |
115BA | |
115BB | |
1D15E-1D164 | |
1D1BB-1D1C0 | |
NFD: | |
0300-0304 | |
0306-030C | |
030F | |
0311 | |
0313 | |
0314 | |
031B | |
0323-0328 | |
032D | |
032E | |
0330 | |
0331 | |
0342 | |
0345 | |
05B4 | |
05B7-05B9 | |
05BC | |
05BF | |
05C1 | |
05C2 | |
0653-0655 | |
093C | |
09BC | |
09BE | |
09D7 | |
0A3C | |
0B3C | |
0B3E | |
0B56 | |
0B57 | |
0BBE | |
0BD7 | |
0C56 | |
0CC2 | |
0CD5 | |
0CD6 | |
0D3E | |
0D57 | |
0DCA | |
0DCF | |
0DDF | |
0F72 | |
0F74 | |
0F80 | |
0FB5 | |
0FB7 | |
102E | |
1B35 | |
3099 | |
309A | |
110BA | |
11127 | |
1133E | |
11357 | |
114B0 | |
114BA | |
114BD | |
115AF | |
1D165 | |
1D16E-1D172 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment