Skip to content

Instantly share code, notes, and snippets.

@sayurin
Last active September 21, 2022 07:57
Show Gist options
  • Save sayurin/59977a7f2f9241e3c22c1b48c25d8fb6 to your computer and use it in GitHub Desktop.
Save sayurin/59977a7f2f9241e3c22c1b48c25d8fb6 to your computer and use it in GitHub Desktop.
HFS+に出現しない文字、出現する文字
#include <charconv>
#include <fstream>
#include <iostream>
#include <map>
#include <regex>
#include <set>
#include <string>
#include <string_view>
#include <cassert>
using namespace std::literals;
/// Parses the text of a regex sub-match as a hexadecimal integer.
///
/// @param sm  Sub-match expected to consist solely of hexadecimal digits.
/// @return    The parsed value, or -1 when the sub-match did not participate
///            in the match, is empty, contains a non-hex character, or does
///            not fit in an int.
static auto hex(std::ssub_match const& sm) {
    int code = -1;
    if (!sm.matched || sm.first == sm.second)
        return code;
    // Derive the end pointer from the length instead of dereferencing
    // sm.second, which may be the end iterator of the underlying string.
    char const* const first = &*sm.first;
    char const* const last = first + sm.length();
    auto const [ptr, ec] = std::from_chars(first, last, code, 16);
    // Reject both conversion errors and trailing unparsed characters.
    if (ec != std::errc{} || ptr != last)
        return -1;
    return code;
}
/// Formats a byte value as a regex hexadecimal escape of the form `\xNN`.
///
/// @param code  Value in [0, 0xFF]; enforced by assert.
/// @return      Four-character string: backslash, 'x' and two lowercase
///              hexadecimal digits.
static auto hex(int code) {
    assert(0 <= code && code < 0x100);
    constexpr char digits[] = "0123456789abcdef";
    // High nibble first, then low nibble, so single-digit values are zero padded.
    std::string escaped{ '\\', 'x', digits[code >> 4], digits[code & 0xF] };
    return escaped;
}
/// Combines regex alternatives into a single pattern.
///
/// @param terms  Alternatives, each already a complete regex fragment.
/// @return       The sole term unchanged when there is exactly one; otherwise
///               the terms separated by '|' inside a non-capturing group.
///               An empty list yields "(?!)", a pattern that never matches.
static auto join(std::vector<std::string> const& terms) {
    // An alternation without branches cannot match anything. Without this
    // guard the closing step would produce the malformed pattern "(?)".
    if (empty(terms))
        return std::string{ "(?!)" };
    if (size(terms) == 1)
        return terms.front();
    std::string pattern{ "(?:" };
    char const* separator = "";
    for (auto const& term : terms) {
        pattern += separator;
        pattern += term;
        separator = "|";
    }
    pattern += ')';
    return pattern;
}
/// Renders an inclusive run of values as the body of a regex character class.
///
/// A single value yields one escape, two adjacent values yield both escapes
/// back to back, and longer runs yield "first-last".
///
/// @param start  First value of the run; must satisfy 0 <= start <= last.
/// @param last   Final value of the run (inclusive).
/// @return       The escaped run, without surrounding brackets.
static auto classRange(int start, int last) {
    assert(0 <= start && start <= last);
    auto range = hex(start);
    if (start == last)
        return range;
    // A dash only pays off when at least one value lies between the ends.
    if (last - start > 1)
        range += '-';
    range += hex(last);
    return range;
}
/// Builds the regex atom matching any single value from a set.
///
/// A one-element set becomes a bare escape; larger sets become a bracketed
/// character class in which consecutive values are collapsed into runs.
///
/// @param set  Non-empty set of values (enforced by assert).
/// @return     A vector holding exactly one pattern string.
static auto pattern(std::set<int> const& set) {
    assert(!empty(set));
    std::vector<std::string> atoms;
    if (size(set) == 1) {
        atoms.emplace_back(hex(*begin(set)));
        return atoms;
    }
    std::string characterClass{ '[' };
    // -1 marks "no run opened yet".
    int runStart = -1, runLast = -1;
    for (int const code : set) {
        bool const extendsRun = runStart != -1 && code == runLast + 1;
        if (extendsRun) {
            runLast = code;
            continue;
        }
        // Flush the finished run (if any) before opening a new one.
        if (runStart != -1)
            characterClass += classRange(runStart, runLast);
        runStart = runLast = code;
    }
    characterClass += classRange(runStart, runLast);
    characterClass += ']';
    atoms.emplace_back(characterClass);
    return atoms;
}
/// Builds regex alternatives for a nested map of leading value -> trailing structure.
///
/// Leading values whose trailing structure yields the same pattern are merged
/// into one alternative, so they share a single copy of that suffix. The
/// alternatives are returned ordered by the smallest leading value of each.
///
/// @param map  Map from a leading value to either a set of values or another
///             map of the same shape.
/// @return     One pattern string per distinct suffix.
template<class Map>
static auto pattern(Map const& map) {
    // Bind by const reference throughout: the mapped values are whole nested
    // containers and copying them on every iteration is wasted work.
    std::map<std::string, std::set<int>> leadsBySuffix;
    for (auto const& [lead, tail] : map)
        leadsBySuffix[join(pattern(tail))].emplace(lead);
    // Re-key by the smallest leading value so the output order follows the
    // numeric order of the leading values rather than the suffix text.
    std::map<int, std::string> ordered;
    for (auto const& [suffix, leads] : leadsBySuffix)
        ordered.emplace(*begin(leads), join(pattern(leads)) + suffix);
    std::vector<std::string> result;
    result.reserve(size(ordered));
    for (auto const& [_, atom] : ordered)
        result.emplace_back(atom);
    return result;
}
/// Appends the patterns generated for `map` to a vector of pattern strings.
///
/// Only participates in overload resolution when the left operand is a
/// (possibly cv/ref-qualified) std::vector<std::string>. The trait is spelled
/// with remove_cv_t/remove_reference_t, the C++17 equivalent of
/// std::remove_cvref_t, so the file does not need C++20 for this alone.
///
/// @param atoms  Destination vector; accepted as lvalue or temporary.
/// @param map    Set or nested map to convert; nothing is appended if empty.
/// @return       Reference to `atoms`, allowing chained appends.
template<class Vector, class Map, class = std::enable_if_t<std::is_same_v<std::remove_cv_t<std::remove_reference_t<Vector>>, std::vector<std::string>>>>
static inline auto& operator<<(Vector&& atoms, Map const& map) {
    // pattern() requires non-empty input, so empty containers are skipped.
    if (empty(map))
        return atoms;
    auto const patterns = pattern(map);
    atoms.insert(end(atoms), begin(patterns), end(patterns));
    return atoms;
}
/// Produces a byte-level regex matching the UTF-8 encoding of any listed code point.
///
/// Code points are bucketed by encoded length (1 to 4 bytes) into nested
/// containers keyed by each successive byte, which are then turned into
/// alternatives and joined.
///
/// @param codes  Code points in [0, 0x10FFFF] (enforced by assert).
/// @return       The combined pattern string.
static auto regex(std::set<int> const& codes) {
    std::set<int> u1;
    std::map<int, std::set<int>> u2;
    std::map<int, std::map<int, std::set<int>>> u3;
    std::map<int, std::map<int, std::map<int, std::set<int>>>> u4;
    // Continuation byte carrying the six bits of `code` that start at bit `shift`.
    auto const trail = [](int code, int shift) { return 0x80 | ((code >> shift) & 0x3F); };
    for (int const code : codes) {
        assert(0 <= code && code <= 0x10'FFFF);
        if (code < 0x80) {
            u1.emplace(code);
            continue;
        }
        if (code < 0x800) {
            u2[0xC0 | (code >> 6)].emplace(trail(code, 0));
            continue;
        }
        if (code < 0x1'0000) {
            u3[0xE0 | (code >> 12)][trail(code, 6)].emplace(trail(code, 0));
            continue;
        }
        u4[0xF0 | (code >> 18)][trail(code, 12)][trail(code, 6)].emplace(trail(code, 0));
    }
    return join(std::vector<std::string>{} << u1 << u2 << u3 << u4);
}
/// Reads UnicodeData.txt from the working directory and prints two byte-level
/// regexes: "NFC" for code points that have a matching decomposition entry,
/// and "NFD" for the second code point of those decompositions.
///
/// @return 0 on success, 1 when UnicodeData.txt cannot be opened.
int main() {
    std::set<int> nfc, nfd;
    {
        // Group 1: the code point in the first field. After four skipped
        // fields, the sixth field must start with a hex code point (so tagged
        // entries such as "<compat> ..." do not match) and may carry a second
        // code point, captured as group 2.
        std::regex const re{ R"(^([0-9A-F]+);(?:[^;]*;){4}[0-9A-F]+(?: ([0-9A-F]+))?;)", std::regex_constants::optimize };
        // https://www.unicode.org/Public/UCD/latest/ucd/
        std::ifstream ud{ "UnicodeData.txt" };
        if (!ud) {
            // Without this check a missing file would silently produce
            // patterns built from empty sets.
            std::cerr << "error: cannot open UnicodeData.txt\n";
            return 1;
        }
        for (std::string line; getline(ud, line);) {
            std::smatch m;
            if (!std::regex_search(line, m, re))
                continue;
            auto const code = hex(m[1]);
            // Ranges skipped per https://developer.apple.com/library/archive/qa/qa1173/
            bool const excluded = (0x2000 <= code && code <= 0x2FFF)
                || (0xF900 <= code && code <= 0xFAFF)
                || (0x2F800 <= code && code <= 0x2FAFF);
            if (excluded)
                continue;
            nfc.emplace(code);
            if (m[2].matched)
                nfd.emplace(hex(m[2]));
        }
    }
    std::cout << "NFC:" << '\n' << regex(nfc) << '\n';
    std::cout << "NFD:" << '\n' << regex(nfd) << '\n';
}
NFC:
(?:\xc3[\x80-\x85\x87-\x8f\x91-\x96\x99-\x9d\xa0-\xa5\xa7-\xaf\xb1-\xb6\xb9-\xbd\xbf]
|\xc4[\x80-\x8f\x92-\xa5\xa8-\xb0\xb4-\xb7\xb9-\xbe]
|\xc5[\x83-\x88\x8c-\x91\x94-\xa5\xa8-\xbe]
|\xc6[\xa0\xa1\xaf\xb0]
|\xc7[\x8d-\x9c\x9e-\xa3\xa6-\xb0\xb4\xb5\xb8-\xbf]
|\xc8[\x80-\x9b\x9e\x9f\xa6-\xb3]
|\xcd[\x80\x81\x83\x84\xb4\xbe]
|\xce[\x85-\x8a\x8c\x8e-\x90\xaa-\xb0]
|\xcf[\x8a-\x8e\x93\x94]
|\xd0[\x80\x81\x83\x87\x8c-\x8e\x99\xb9]
|\xd1[\x90\x91\x93\x97\x9c-\x9e\xb6\xb7]
|\xd3[\x81\x82\x90-\x93\x96\x97\x9a-\x9f\xa2-\xa7\xaa-\xb5\xb8\xb9]
|\xd8[\xa2-\xa6]
|\xdb[\x80\x82\x93]
|\xe0(?:\xa4[\xa9\xb1\xb4]
|\xa5[\x98-\x9f]
|\xa7[\x8b\x8c\x9c\x9d\x9f]
|\xa8[\xb3\xb6]
|\xa9[\x99-\x9b\x9e]
|\xad[\x88\x8b\x8c\x9c\x9d]
|\xae\x94
|[\xaf\xb5][\x8a-\x8c]
|\xb1\x88
|\xb3[\x80\x87\x88\x8a\x8b]
|\xb7[\x9a\x9c-\x9e]
|\xbd[\x83\x8d\x92\x97\x9c\xa9\xb3\xb5\xb6\xb8]
|\xbe[\x81\x93\x9d\xa2\xa7\xac\xb9])
|\xe1(?:\x80\xa6
|\xac[\x86\x88\x8a\x8c\x8e\x92\xbb\xbd]
|\xad[\x80\x81\x83]
|[\xb8\xb9][\x80-\xbf]
|\xba[\x80-\x99\x9b\xa0-\xbf]
|\xbb[\x80-\xb9]
|\xbc[\x80-\x95\x98-\x9d\xa0-\xbf]
|\xbd[\x80-\x85\x88-\x8d\x90-\x97\x99\x9b\x9d\x9f-\xbd]
|\xbe[\x80-\xb4\xb6-\xbc\xbe]
|\xbf[\x81-\x84\x86-\x93\x96-\x9b\x9d-\xaf\xb2-\xb4\xb6-\xbd])
|\xe3(?:\x81[\x8c\x8e\x90\x92\x94\x96\x98\x9a\x9c\x9e\xa0\xa2\xa5\xa7\xa9\xb0\xb1\xb3\xb4\xb6\xb7\xb9\xba\xbc\xbd]
|\x82[\x94\x9e\xac\xae\xb0\xb2\xb4\xb6\xb8\xba\xbc\xbe]
|\x83[\x80\x82\x85\x87\x89\x90\x91\x93\x94\x96\x97\x99\x9a\x9c\x9d\xb4\xb7-\xba\xbe])
|\xef(?:\xac[\x9d\x9f\xaa-\xb6\xb8-\xbc\xbe]
|\xad[\x80\x81\x83\x84\x86-\x8e])
|\xf0(?:\x91(?:\x82[\x9a\x9c\xab]
|\x84[\xae\xaf]
|\x8d[\x8b\x8c]
|\x92[\xbb\xbc\xbe]
|\x96[\xba\xbb])
|\x9d(?:\x85[\x9e-\xa4]
|\x86[\xbb-\xbf]
|\x87\x80)))
NFD:
(?:\xcc[\x80-\x84\x86-\x8c\x8f\x91\x93\x94\x9b\xa3-\xa8\xad\xae\xb0\xb1]
|\xcd[\x82\x85]
|\xd6[\xb4\xb7-\xb9\xbc\xbf]
|\xd7[\x81\x82]
|\xd9[\x93-\x95]
|\xe0(?:[\xa4\xa8]\xbc
|[\xa6\xac][\xbc\xbe]
|[\xa7\xaf\xb5]\x97
|\xad[\x96\x97]
|[\xae\xb4]\xbe
|\xb1\x96
|\xb3[\x82\x95\x96]
|\xb7[\x8a\x8f\x9f]
|\xbd[\xb2\xb4]
|\xbe[\x80\xb5\xb7])
|\xe1(?:\x80\xae|\xac\xb5)
|\xe3\x82[\x99\x9a]
|\xf0(?:\x91(?:\x82\xba
|\x84\xa7
|\x8c\xbe
|\x8d\x97
|\x92[\xb0\xba\xbd]
|\x96\xaf)
|\x9d\x85[\xa5\xae-\xb2]))
NFC:
00C0-00C5
00C7-00CF
00D1-00D6
00D9-00DD
00E0-00E5
00E7-00EF
00F1-00F6
00F9-00FD
00FF-010F
0112-0125
0128-0130
0134-0137
0139-013E
0143-0148
014C-0151
0154-0165
0168-017E
01A0
01A1
01AF
01B0
01CD-01DC
01DE-01E3
01E6-01F0
01F4
01F5
01F8-021B
021E
021F
0226-0233
0340
0341
0343
0344
0374
037E
0385-038A
038C
038E-0390
03AA-03B0
03CA-03CE
03D3
03D4
0400
0401
0403
0407
040C-040E
0419
0439
0450
0451
0453
0457
045C-045E
0476
0477
04C1
04C2
04D0-04D3
04D6
04D7
04DA-04DF
04E2-04E7
04EA-04F5
04F8
04F9
0622-0626
06C0
06C2
06D3
0929
0931
0934
0958-095F
09CB
09CC
09DC
09DD
09DF
0A33
0A36
0A59-0A5B
0A5E
0B48
0B4B
0B4C
0B5C
0B5D
0B94
0BCA-0BCC
0C48
0CC0
0CC7
0CC8
0CCA
0CCB
0D4A-0D4C
0DDA
0DDC-0DDE
0F43
0F4D
0F52
0F57
0F5C
0F69
0F73
0F75
0F76
0F78
0F81
0F93
0F9D
0FA2
0FA7
0FAC
0FB9
1026
1B06
1B08
1B0A
1B0C
1B0E
1B12
1B3B
1B3D
1B40
1B41
1B43
1E00-1E99
1E9B
1EA0-1EF9
1F00-1F15
1F18-1F1D
1F20-1F45
1F48-1F4D
1F50-1F57
1F59
1F5B
1F5D
1F5F-1F7D
1F80-1FB4
1FB6-1FBC
1FBE
1FC1-1FC4
1FC6-1FD3
1FD6-1FDB
1FDD-1FEF
1FF2-1FF4
1FF6-1FFD
304C
304E
3050
3052
3054
3056
3058
305A
305C
305E
3060
3062
3065
3067
3069
3070
3071
3073
3074
3076
3077
3079
307A
307C
307D
3094
309E
30AC
30AE
30B0
30B2
30B4
30B6
30B8
30BA
30BC
30BE
30C0
30C2
30C5
30C7
30C9
30D0
30D1
30D3
30D4
30D6
30D7
30D9
30DA
30DC
30DD
30F4
30F7-30FA
30FE
FB1D
FB1F
FB2A-FB36
FB38-FB3C
FB3E
FB40
FB41
FB43
FB44
FB46-FB4E
1109A
1109C
110AB
1112E
1112F
1134B
1134C
114BB
114BC
114BE
115BA
115BB
1D15E-1D164
1D1BB-1D1C0
NFD:
0300-0304
0306-030C
030F
0311
0313
0314
031B
0323-0328
032D
032E
0330
0331
0342
0345
05B4
05B7-05B9
05BC
05BF
05C1
05C2
0653-0655
093C
09BC
09BE
09D7
0A3C
0B3C
0B3E
0B56
0B57
0BBE
0BD7
0C56
0CC2
0CD5
0CD6
0D3E
0D57
0DCA
0DCF
0DDF
0F72
0F74
0F80
0FB5
0FB7
102E
1B35
3099
309A
110BA
11127
1133E
11357
114B0
114BA
114BD
115AF
1D165
1D16E-1D172
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment