Skip to content

Instantly share code, notes, and snippets.

@musyoku
Created July 5, 2017 15:04
Show Gist options
  • Save musyoku/ad95704c90c77116fa4bd3ea3a746349 to your computer and use it in GitHub Desktop.
Save musyoku/ad95704c90c77116fa4bd3ea3a746349 to your computer and use it in GitHub Desktop.
import re
# Unicode block table, one record per line: "<NAME> <U+first> <U+last> (<count>)".
# Only the name and the two code points are consumed below; the trailing
# "(<count>)" field is matched but never read.
# Parenthesised implicit concatenation is used instead of backslash
# continuations so that a trailing "\" can never splice the next statement
# onto the literal.
text = (
    "BASIC_LATIN U+0000 U+007F (128)\n"
    "LATIN_1_SUPPLEMENT U+0080 U+00FF (128)\n"
    "LATIN_EXTENDED_A U+0100 U+017F (128)\n"
    "LATIN_EXTENDED_B U+0180 U+024F (208)\n"
    "IPA_EXTENSIONS U+0250 U+02AF (96)\n"
    "SPACING_MODIFIER_LETTERS U+02B0 U+02FF (80)\n"
    "COMBINING_DIACRITICAL_MARKS U+0300 U+036F (112)\n"
    "GREEK_AND_COPTIC U+0370 U+03FF (135)\n"
    "CYRILLIC U+0400 U+04FF (256)\n"
    "CYRILLIC_SUPPLEMENT U+0500 U+052F (48)\n"
    "ARMENIAN U+0530 U+058F (89)\n"
    "HEBREW U+0590 U+05FF (87)\n"
    "ARABIC U+0600 U+06FF (255)\n"
    "SYRIAC U+0700 U+074F (77)\n"
    "ARABIC_SUPPLEMENT U+0750 U+077F (48)\n"
    "THAANA U+0780 U+07BF (50)\n"
    "NKO U+07C0 U+07FF (59)\n"
    "SAMARITAN U+0800 U+083F (61)\n"
    "MANDAIC U+0840 U+085F (29)\n"
    "SYRIAC_SUPPLEMENT U+0860 U+086F (11)\n"
    "ARABIC_EXTENDED_A U+08A0 U+08FF (73)\n"
    "DEVANAGARI U+0900 U+097F (128)\n"
    "BENGALI U+0980 U+09FF (95)\n"
    "GURMUKHI U+0A00 U+0A7F (79)\n"
    "GUJARATI U+0A80 U+0AFF (91)\n"
    "ORIYA U+0B00 U+0B7F (90)\n"
    "TAMIL U+0B80 U+0BFF (72)\n"
    "TELUGU U+0C00 U+0C7F (96)\n"
    "KANNADA U+0C80 U+0CFF (88)\n"
    "MALAYALAM U+0D00 U+0D7F (117)\n"
    "SINHALA U+0D80 U+0DFF (90)\n"
    "THAI U+0E00 U+0E7F (87)\n"
    "LAO U+0E80 U+0EFF (67)\n"
    "TIBETAN U+0F00 U+0FFF (211)\n"
    "MYANMAR U+1000 U+109F (160)\n"
    "GEORGIAN U+10A0 U+10FF (88)\n"
    "HANGUL_JAMO U+1100 U+11FF (256)\n"
    "ETHIOPIC U+1200 U+137F (358)\n"
    "ETHIOPIC_SUPPLEMENT U+1380 U+139F (26)\n"
    "CHEROKEE U+13A0 U+13FF (92)\n"
    "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS U+1400 U+167F (640)\n"
    "OGHAM U+1680 U+169F (29)\n"
    "RUNIC U+16A0 U+16FF (89)\n"
    "TAGALOG U+1700 U+171F (20)\n"
    "HANUNOO U+1720 U+173F (23)\n"
    "BUHID U+1740 U+175F (20)\n"
    "TAGBANWA U+1760 U+177F (18)\n"
    "KHMER U+1780 U+17FF (114)\n"
    "MONGOLIAN U+1800 U+18AF (156)\n"
    "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED U+18B0 U+18FF (70)\n"
    "LIMBU U+1900 U+194F (68)\n"
    "TAI_LE U+1950 U+197F (35)\n"
    "NEW_TAI_LUE U+1980 U+19DF (83)\n"
    "KHMER_SYMBOLS U+19E0 U+19FF (32)\n"
    "BUGINESE U+1A00 U+1A1F (30)\n"
    "TAI_THAM U+1A20 U+1AAF (127)\n"
    "COMBINING_DIACRITICAL_MARKS_EXTENDED U+1AB0 U+1AFF (15)\n"
    "BALINESE U+1B00 U+1B7F (121)\n"
    "SUNDANESE U+1B80 U+1BBF (64)\n"
    "BATAK U+1BC0 U+1BFF (56)\n"
    "LEPCHA U+1C00 U+1C4F (74)\n"
    "OL_CHIKI U+1C50 U+1C7F (48)\n"
    "CYRILLIC_EXTENDED_C U+1C80 U+1C8F (9)\n"
    "SUNDANESE_SUPPLEMENT U+1CC0 U+1CCF (8)\n"
    "VEDIC_EXTENSIONS U+1CD0 U+1CFF (42)\n"
    "PHONETIC_EXTENSIONS U+1D00 U+1D7F (128)\n"
    "PHONETIC_EXTENSIONS_SUPPLEMENT U+1D80 U+1DBF (64)\n"
    "COMBINING_DIACRITICAL_MARKS_SUPPLEMENT U+1DC0 U+1DFF (63)\n"
    "LATIN_EXTENDED_ADDITIONAL U+1E00 U+1EFF (256)\n"
    "GREEK_EXTENDED U+1F00 U+1FFF (233)\n"
    "GENERAL_PUNCTUATION U+2000 U+206F (111)\n"
    "SUPERSCRIPTS_AND_SUBSCRIPTS U+2070 U+209F (42)\n"
    "CURRENCY_SYMBOLS U+20A0 U+20CF (32)\n"
    "COMBINING_DIACRITICAL_MARKS_FOR_SYMBOLS U+20D0 U+20FF (33)\n"
    "LETTERLIKE_SYMBOLS U+2100 U+214F (80)\n"
    "NUMBER_FORMS U+2150 U+218F (60)\n"
    "ARROWS U+2190 U+21FF (112)\n"
    "MATHEMATICAL_OPERATORS U+2200 U+22FF (256)\n"
    "MISCELLANEOUS_TECHNICAL U+2300 U+23FF (256)\n"
    "CONTROL_PICTURES U+2400 U+243F (39)\n"
    "OPTICAL_CHARACTER_RECOGNITION U+2440 U+245F (11)\n"
    "ENCLOSED_ALPHANUMERICS U+2460 U+24FF (160)\n"
    "BOX_DRAWING U+2500 U+257F (128)\n"
    "BLOCK_ELEMENTS U+2580 U+259F (32)\n"
    "GEOMETRIC_SHAPES U+25A0 U+25FF (96)\n"
    "MISCELLANEOUS_SYMBOLS U+2600 U+26FF (256)\n"
    "DINGBATS U+2700 U+27BF (192)\n"
    "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A U+27C0 U+27EF (48)\n"
    "SUPPLEMENTAL_ARROWS_A U+27F0 U+27FF (16)\n"
    "BRAILLE_PATTERNS U+2800 U+28FF (256)\n"
    "SUPPLEMENTAL_ARROWS_B U+2900 U+297F (128)\n"
    "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B U+2980 U+29FF (128)\n"
    "SUPPLEMENTAL_MATHEMATICAL_OPERATORS U+2A00 U+2AFF (256)\n"
    "MISCELLANEOUS_SYMBOLS_AND_ARROWS U+2B00 U+2BFF (207)\n"
    "GLAGOLITIC U+2C00 U+2C5F (94)\n"
    "LATIN_EXTENDED_C U+2C60 U+2C7F (32)\n"
    "COPTIC U+2C80 U+2CFF (123)\n"
    "GEORGIAN_SUPPLEMENT U+2D00 U+2D2F (40)\n"
    "TIFINAGH U+2D30 U+2D7F (59)\n"
    "ETHIOPIC_EXTENDED U+2D80 U+2DDF (79)\n"
    "CYRILLIC_EXTENDED_A U+2DE0 U+2DFF (32)\n"
    "SUPPLEMENTAL_PUNCTUATION U+2E00 U+2E7F (74)\n"
    "CJK_RADICALS_SUPPLEMENT U+2E80 U+2EFF (115)\n"
    "KANGXI_RADICALS U+2F00 U+2FDF (214)\n"
    "IDEOGRAPHIC_DESCRIPTION_CHARACTERS U+2FF0 U+2FFF (12)\n"
    "CJK_SYMBOLS_AND_PUNCTUATION U+3000 U+303F (64)\n"
    "HIRAGANA U+3040 U+309F (93)\n"
    "KATAKANA U+30A0 U+30FF (96)\n"
    "BOPOMOFO U+3100 U+312F (42)\n"
    "HANGUL_COMPATIBILITY_JAMO U+3130 U+318F (94)\n"
    "KANBUN U+3190 U+319F (16)\n"
    "BOPOMOFO_EXTENDED U+31A0 U+31BF (27)\n"
    "CJK_STROKES U+31C0 U+31EF (36)\n"
    "KATAKANA_PHONETIC_EXTENSIONS U+31F0 U+31FF (16)\n"
    "ENCLOSED_CJK_LETTERS_AND_MONTHS U+3200 U+32FF (254)\n"
    "CJK_COMPATIBILITY U+3300 U+33FF (256)\n"
    "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A U+3400 U+4DBF (6192)\n"
    "YIJING_HEXAGRAM_SYMBOLS U+4DC0 U+4DFF (64)\n"
    "CJK_UNIFIED_IDEOGRAPHS U+4E00 U+9FFF (20942)\n"
    "YI_SYLLABLES U+A000 U+A48F (1165)\n"
    "YI_RADICALS U+A490 U+A4CF (55)\n"
    "LISU U+A4D0 U+A4FF (48)\n"
    "VAI U+A500 U+A63F (300)\n"
    "CYRILLIC_EXTENDED_B U+A640 U+A69F (96)\n"
    "BAMUM U+A6A0 U+A6FF (88)\n"
    "MODIFIER_TONE_LETTERS U+A700 U+A71F (32)\n"
    "LATIN_EXTENDED_D U+A720 U+A7FF (160)\n"
    "SYLOTI_NAGRI U+A800 U+A82F (44)\n"
    "COMMON_INDIC_NUMBER_FORMS U+A830 U+A83F (10)\n"
    "PHAGS_PA U+A840 U+A87F (56)\n"
    "SAURASHTRA U+A880 U+A8DF (82)\n"
    "DEVANAGARI_EXTENDED U+A8E0 U+A8FF (30)\n"
    "KAYAH_LI U+A900 U+A92F (48)\n"
    "REJANG U+A930 U+A95F (37)\n"
    "HANGUL_JAMO_EXTENDED_A U+A960 U+A97F (29)\n"
    "JAVANESE U+A980 U+A9DF (91)\n"
    "MYANMAR_EXTENDED_B U+A9E0 U+A9FF (31)\n"
    "CHAM U+AA00 U+AA5F (83)\n"
    "MYANMAR_EXTENDED_A U+AA60 U+AA7F (32)\n"
    "TAI_VIET U+AA80 U+AADF (72)\n"
    "MEETEI_MAYEK_EXTENSIONS U+AAE0 U+AAFF (23)\n"
    "ETHIOPIC_EXTENDED_A U+AB00 U+AB2F (32)\n"
    "LATIN_EXTENDED_E U+AB30 U+AB6F (54)\n"
    "CHEROKEE_SUPPLEMENT U+AB70 U+ABBF (80)\n"
    "MEETEI_MAYEK U+ABC0 U+ABFF (56)\n"
    "HANGUL_SYLLABLES U+AC00 U+D7AF (2)\n"
    "HANGUL_JAMO_EXTENDED_B U+D7B0 U+D7FF (72)\n"
    "HIGH_SURROGATES U+D800 U+DB7F (2)\n"
    "HIGH_PRIVATE_USE_SURROGATES U+DB80 U+DBFF (2)\n"
    "LOW_SURROGATES U+DC00 U+DFFF (2)\n"
    "PRIVATE_USE_AREA U+E000 U+F8FF (2)\n"
    "CJK_COMPATIBILITY_IDEOGRAPHS U+F900 U+FAFF (472)\n"
    "ALPHABETIC_PRESENTATION_FORMS U+FB00 U+FB4F (58)\n"
    "ARABIC_PRESENTATION_FORMS_A U+FB50 U+FDFF (643)\n"
    "VARIATION_SELECTORS U+FE00 U+FE0F (16)\n"
    "VERTICAL_FORMS U+FE10 U+FE1F (10)\n"
    "COMBINING_HALF_MARKS U+FE20 U+FE2F (16)\n"
    "CJK_COMPATIBILITY_FORMS U+FE30 U+FE4F (32)\n"
    "SMALL_FORM_VARIANTS U+FE50 U+FE6F (26)\n"
    "ARABIC_PRESENTATION_FORMS_B U+FE70 U+FEFF (141)\n"
    "HALFWIDTH_AND_FULLWIDTH_FORMS U+FF00 U+FFEF (225)\n"
    "SPECIALS U+FFF0 U+FFFF (7)\n"
    "LINEAR_B_SYLLABARY U+10000 U+1007F (88)\n"
    "LINEAR_B_IDEOGRAMS U+10080 U+100FF (123)\n"
    "AEGEAN_NUMBERS U+10100 U+1013F (57)\n"
    "ANCIENT_GREEK_NUMBERS U+10140 U+1018F (79)\n"
    "ANCIENT_SYMBOLS U+10190 U+101CF (13)\n"
    "PHAISTOS_DISC U+101D0 U+101FF (46)\n"
    "LYCIAN U+10280 U+1029F (29)\n"
    "CARIAN U+102A0 U+102DF (49)\n"
    "COPTIC_EPACT_NUMBERS U+102E0 U+102FF (28)\n"
    "OLD_ITALIC U+10300 U+1032F (39)\n"
    "GOTHIC U+10330 U+1034F (27)\n"
    "OLD_PERMIC U+10350 U+1037F (43)\n"
    "UGARITIC U+10380 U+1039F (31)\n"
    "OLD_PERSIAN U+103A0 U+103DF (50)\n"
    "DESERET U+10400 U+1044F (80)\n"
    "SHAVIAN U+10450 U+1047F (48)\n"
    "OSMANYA U+10480 U+104AF (40)\n"
    "OSAGE U+104B0 U+104FF (72)\n"
    "ELBASAN U+10500 U+1052F (40)\n"
    "CAUCASIAN_ALBANIAN U+10530 U+1056F (53)\n"
    "LINEAR_A U+10600 U+1077F (341)\n"
    "CYPRIOT_SYLLABARY U+10800 U+1083F (55)\n"
    "IMPERIAL_ARAMAIC U+10840 U+1085F (31)\n"
    "PALMYRENE U+10860 U+1087F (32)\n"
    "NABATAEAN U+10880 U+108AF (40)\n"
    "HATRAN U+108E0 U+108FF (26)\n"
    "PHOENICIAN U+10900 U+1091F (29)\n"
    "LYDIAN U+10920 U+1093F (27)\n"
    "MEROITIC_HIEROGLYPHS U+10980 U+1099F (32)\n"
    "MEROITIC_CURSIVE U+109A0 U+109FF (90)\n"
    "KHAROSHTHI U+10A00 U+10A5F (65)\n"
    "OLD_SOUTH_ARABIAN U+10A60 U+10A7F (32)\n"
    "OLD_NORTH_ARABIAN U+10A80 U+10A9F (32)\n"
    "MANICHAEAN U+10AC0 U+10AFF (51)\n"
    "AVESTAN U+10B00 U+10B3F (61)\n"
    "INSCRIPTIONAL_PARTHIAN U+10B40 U+10B5F (30)\n"
    "INSCRIPTIONAL_PAHLAVI U+10B60 U+10B7F (27)\n"
    "PSALTER_PAHLAVI U+10B80 U+10BAF (29)\n"
    "OLD_TURKIC U+10C00 U+10C4F (73)\n"
    "OLD_HUNGARIAN U+10C80 U+10CFF (108)\n"
    "RUMI_NUMERAL_SYMBOLS U+10E60 U+10E7F (31)\n"
    "BRAHMI U+11000 U+1107F (109)\n"
    "KAITHI U+11080 U+110CF (66)\n"
    "SORA_SOMPENG U+110D0 U+110FF (35)\n"
    "CHAKMA U+11100 U+1114F (67)\n"
    "MAHAJANI U+11150 U+1117F (39)\n"
    "SHARADA U+11180 U+111DF (94)\n"
    "SINHALA_ARCHAIC_NUMBERS U+111E0 U+111FF (20)\n"
    "KHOJKI U+11200 U+1124F (62)\n"
    "MULTANI U+11280 U+112AF (38)\n"
    "KHUDAWADI U+112B0 U+112FF (69)\n"
    "GRANTHA U+11300 U+1137F (85)\n"
    "NEWA U+11400 U+1147F (92)\n"
    "TIRHUTA U+11480 U+114DF (82)\n"
    "SIDDHAM U+11580 U+115FF (92)\n"
    "MODI U+11600 U+1165F (79)\n"
    "MONGOLIAN_SUPPLEMENT U+11660 U+1167F (13)\n"
    "TAKRI U+11680 U+116CF (66)\n"
    "AHOM U+11700 U+1173F (57)\n"
    "WARANG_CITI U+118A0 U+118FF (84)\n"
    "ZANABAZAR_SQUARE U+11A00 U+11A4F (72)\n"
    "SOYOMBO U+11A50 U+11AAF (80)\n"
    "PAU_CIN_HAU U+11AC0 U+11AFF (57)\n"
    "BHAIKSUKI U+11C00 U+11C6F (97)\n"
    "MARCHEN U+11C70 U+11CBF (68)\n"
    "MASARAM_GONDI U+11D00 U+11D5F (75)\n"
    "CUNEIFORM U+12000 U+123FF (922)\n"
    "CUNEIFORM_NUMBERS_AND_PUNCTUATION U+12400 U+1247F (116)\n"
    "EARLY_DYNASTIC_CUNEIFORM U+12480 U+1254F (196)\n"
    "EGYPTIAN_HIEROGLYPHS U+13000 U+1342F (1071)\n"
    "ANATOLIAN_HIEROGLYPHS U+14400 U+1467F (583)\n"
    "BAMUM_SUPPLEMENT U+16800 U+16A3F (569)\n"
    "MRO U+16A40 U+16A6F (43)\n"
    "BASSA_VAH U+16AD0 U+16AFF (36)\n"
    "PAHAWH_HMONG U+16B00 U+16B8F (127)\n"
    "MIAO U+16F00 U+16F9F (133)\n"
    "IDEOGRAPHIC_SYMBOLS_AND_PUNCTUATION U+16FE0 U+16FFF (2)\n"
    "TANGUT U+17000 U+187FF (2)\n"
    "TANGUT_COMPONENTS U+18800 U+18AFF (755)\n"
    "KANA_SUPPLEMENT U+1B000 U+1B0FF (256)\n"
    "KANA_EXTENDED_A U+1B100 U+1B12F (31)\n"
    "NUSHU U+1B170 U+1B2FF (396)\n"
    "DUPLOYAN U+1BC00 U+1BC9F (143)\n"
    "SHORTHAND_FORMAT_CONTROLS U+1BCA0 U+1BCAF (4)\n"
    "BYZANTINE_MUSICAL_SYMBOLS U+1D000 U+1D0FF (246)\n"
    "MUSICAL_SYMBOLS U+1D100 U+1D1FF (231)\n"
    "ANCIENT_GREEK_MUSICAL_NOTATION U+1D200 U+1D24F (70)\n"
    "TAI_XUAN_JING_SYMBOLS U+1D300 U+1D35F (87)\n"
    "COUNTING_ROD_NUMERALS U+1D360 U+1D37F (18)\n"
    "MATHEMATICAL_ALPHANUMERIC_SYMBOLS U+1D400 U+1D7FF (996)\n"
    "SUTTON_SIGNWRITING U+1D800 U+1DAAF (672)\n"
    "GLAGOLITIC_SUPPLEMENT U+1E000 U+1E02F (38)\n"
    "MENDE_KIKAKUI U+1E800 U+1E8DF (213)\n"
    "ADLAM U+1E900 U+1E95F (87)\n"
    "ARABIC_MATHEMATICAL_ALPHABETIC_SYMBOLS U+1EE00 U+1EEFF (143)\n"
    "MAHJONG_TILES U+1F000 U+1F02F (44)\n"
    "DOMINO_TILES U+1F030 U+1F09F (100)\n"
    "PLAYING_CARDS U+1F0A0 U+1F0FF (82)\n"
    "ENCLOSED_ALPHANUMERIC_SUPPLEMENT U+1F100 U+1F1FF (191)\n"
    "ENCLOSED_IDEOGRAPHIC_SUPPLEMENT U+1F200 U+1F2FF (64)\n"
    "MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS U+1F300 U+1F5FF (768)\n"
    "EMOTICONS U+1F600 U+1F64F (80)\n"
    "ORNAMENTAL_DINGBATS U+1F650 U+1F67F (48)\n"
    "TRANSPORT_AND_MAP_SYMBOLS U+1F680 U+1F6FF (107)\n"
    "ALCHEMICAL_SYMBOLS U+1F700 U+1F77F (116)\n"
    "GEOMETRIC_SHAPES_EXTENDED U+1F780 U+1F7FF (85)\n"
    "SUPPLEMENTAL_ARROWS_C U+1F800 U+1F8FF (148)\n"
    "SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS U+1F900 U+1F9FF (148)\n"
    "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B U+20000 U+2A6DF (42676)\n"
    "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C U+2A700 U+2B73F (60)\n"
    "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D U+2B740 U+2B81F (27)\n"
    "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E U+2B820 U+2CEAF (6)\n"
    "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F U+2CEB0 U+2EBEF (2)\n"
    "CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT U+2F800 U+2FA1F (542)\n"
    "TAGS U+E0000 U+E007F (97)\n"
    "VARIATION_SELECTORS_SUPPLEMENT U+E0100 U+E01EF (240)\n"
    "SUPPLEMENTARY_PRIVATE_USE_AREA_A U+F0000 U+FFFFF (4)\n"
    "SUPPLEMENTARY_PRIVATE_USE_AREA_B U+100000 U+10FFFF (4)\n"
)

# Four space-separated fields per record; the fourth is matched but unused.
# Compiled once so the loop below does not look the pattern up per record.
_RECORD = re.compile(r"([^\t]+) ([^\t]+) ([^\t]+) ([^\t]+)")


def _parse_records(records):
    """Extract block names and hexadecimal bounds from the table records.

    Records that do not match the four-field layout (for instance the empty
    string left behind by the final newline of the table) are skipped.

    Args:
        records: iterable of strings, one table record each.

    Returns:
        Three parallel lists ``(names, firsts, lasts)``. ``firsts`` and
        ``lasts`` hold the code points with the ``U+`` prefix rewritten to
        ``0x`` so they can be pasted into C source as integer literals.
    """
    names, firsts, lasts = [], [], []
    for record in records:
        found = _RECORD.match(record)
        if found is None:
            continue
        name, first, last = found.group(1, 2, 3)
        names.append(name)
        firsts.append(first.replace("U+", "0x"))
        lasts.append(last.replace("U+", "0x"))
    return names, firsts, lasts


def _render(names, firsts, lasts):
    """Build the generated C/C++ source and return it as a list of lines.

    The output consists of one ``CTYPE_*`` ``#define`` per block, a
    ``get_type`` function mapping a character to its block constant, and a
    ``get_name`` function mapping a block constant to a display name.

    NOTE(review): the emitted ``get_type`` takes a ``wchar_t``; ranges above
    0xFFFF presumably only match where ``wchar_t`` is wider than 16 bits --
    confirm for the target platform.
    """
    out = ["#define CTYPE_UNKNOWN 0"]
    # Block constants are numbered from 1; 0 is reserved for CTYPE_UNKNOWN.
    for number, name in enumerate(names, 1):
        out.append("#define CTYPE_{} {}".format(name, number))

    out.append("unsigned int get_type(wchar_t c){")
    for name, first, last in zip(names, firsts, lasts):
        out.append(" if({} <= c && c <= {}){{".format(first, last))
        out.append(" return CTYPE_{};".format(name))
        out.append(" }")
    out.append(" return CTYPE_UNKNOWN;")
    out.append("}")

    out.append("std::string get_name(unsigned int type){")
    for number, name in enumerate(names, 1):
        out.append(" if(type == {}){{".format(number))
        # str.title() already lower-cases the tail of every word, so a
        # separate lower() call is not needed ("BASIC_LATIN" -> "Basic Latin").
        out.append(" return \"{}\";".format(name.replace("_", " ").title()))
        out.append(" }")
    out.append(" return \"Unknown\";")
    # Close get_name(); without this line the emitted function is unterminated.
    out.append("}")
    return out


lines = text.split("\n")
ctypes, starts, ends = _parse_records(lines)
output = _render(ctypes, starts, ends)
# One print of the joined lines writes the same bytes as printing each line.
print("\n".join(output))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment