Created
July 5, 2017 15:04
-
-
Save musyoku/ad95704c90c77116fa4bd3ea3a746349 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Generate C++ source that classifies a character by Unicode block.

Running this script prints, to standard output:

* one ``#define CTYPE_<NAME> <n>`` per block listed in ``text`` (numbered
  from 1; ``CTYPE_UNKNOWN`` is 0),
* ``unsigned int get_type(wchar_t c)``, an if-chain over the block ranges,
* ``std::string get_name(unsigned int type)``, mapping an id back to a
  title-cased block name.
"""
import re

# One block per row: NAME, first code point, last code point and a
# parenthesised count.  Only the first three columns are used below.
# Written as a parenthesised implicit concatenation so that no trailing
# backslash can accidentally splice the next statement onto the literal.
text = (
    "BASIC_LATIN U+0000 U+007F (128)\n"
    "LATIN_1_SUPPLEMENT U+0080 U+00FF (128)\n"
    "LATIN_EXTENDED_A U+0100 U+017F (128)\n"
    "LATIN_EXTENDED_B U+0180 U+024F (208)\n"
    "IPA_EXTENSIONS U+0250 U+02AF (96)\n"
    "SPACING_MODIFIER_LETTERS U+02B0 U+02FF (80)\n"
    "COMBINING_DIACRITICAL_MARKS U+0300 U+036F (112)\n"
    "GREEK_AND_COPTIC U+0370 U+03FF (135)\n"
    "CYRILLIC U+0400 U+04FF (256)\n"
    "CYRILLIC_SUPPLEMENT U+0500 U+052F (48)\n"
    "ARMENIAN U+0530 U+058F (89)\n"
    "HEBREW U+0590 U+05FF (87)\n"
    "ARABIC U+0600 U+06FF (255)\n"
    "SYRIAC U+0700 U+074F (77)\n"
    "ARABIC_SUPPLEMENT U+0750 U+077F (48)\n"
    "THAANA U+0780 U+07BF (50)\n"
    "NKO U+07C0 U+07FF (59)\n"
    "SAMARITAN U+0800 U+083F (61)\n"
    "MANDAIC U+0840 U+085F (29)\n"
    "SYRIAC_SUPPLEMENT U+0860 U+086F (11)\n"
    "ARABIC_EXTENDED_A U+08A0 U+08FF (73)\n"
    "DEVANAGARI U+0900 U+097F (128)\n"
    "BENGALI U+0980 U+09FF (95)\n"
    "GURMUKHI U+0A00 U+0A7F (79)\n"
    "GUJARATI U+0A80 U+0AFF (91)\n"
    "ORIYA U+0B00 U+0B7F (90)\n"
    "TAMIL U+0B80 U+0BFF (72)\n"
    "TELUGU U+0C00 U+0C7F (96)\n"
    "KANNADA U+0C80 U+0CFF (88)\n"
    "MALAYALAM U+0D00 U+0D7F (117)\n"
    "SINHALA U+0D80 U+0DFF (90)\n"
    "THAI U+0E00 U+0E7F (87)\n"
    "LAO U+0E80 U+0EFF (67)\n"
    "TIBETAN U+0F00 U+0FFF (211)\n"
    "MYANMAR U+1000 U+109F (160)\n"
    "GEORGIAN U+10A0 U+10FF (88)\n"
    "HANGUL_JAMO U+1100 U+11FF (256)\n"
    "ETHIOPIC U+1200 U+137F (358)\n"
    "ETHIOPIC_SUPPLEMENT U+1380 U+139F (26)\n"
    "CHEROKEE U+13A0 U+13FF (92)\n"
    "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS U+1400 U+167F (640)\n"
    "OGHAM U+1680 U+169F (29)\n"
    "RUNIC U+16A0 U+16FF (89)\n"
    "TAGALOG U+1700 U+171F (20)\n"
    "HANUNOO U+1720 U+173F (23)\n"
    "BUHID U+1740 U+175F (20)\n"
    "TAGBANWA U+1760 U+177F (18)\n"
    "KHMER U+1780 U+17FF (114)\n"
    "MONGOLIAN U+1800 U+18AF (156)\n"
    "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED U+18B0 U+18FF (70)\n"
    "LIMBU U+1900 U+194F (68)\n"
    "TAI_LE U+1950 U+197F (35)\n"
    "NEW_TAI_LUE U+1980 U+19DF (83)\n"
    "KHMER_SYMBOLS U+19E0 U+19FF (32)\n"
    "BUGINESE U+1A00 U+1A1F (30)\n"
    "TAI_THAM U+1A20 U+1AAF (127)\n"
    "COMBINING_DIACRITICAL_MARKS_EXTENDED U+1AB0 U+1AFF (15)\n"
    "BALINESE U+1B00 U+1B7F (121)\n"
    "SUNDANESE U+1B80 U+1BBF (64)\n"
    "BATAK U+1BC0 U+1BFF (56)\n"
    "LEPCHA U+1C00 U+1C4F (74)\n"
    "OL_CHIKI U+1C50 U+1C7F (48)\n"
    "CYRILLIC_EXTENDED_C U+1C80 U+1C8F (9)\n"
    "SUNDANESE_SUPPLEMENT U+1CC0 U+1CCF (8)\n"
    "VEDIC_EXTENSIONS U+1CD0 U+1CFF (42)\n"
    "PHONETIC_EXTENSIONS U+1D00 U+1D7F (128)\n"
    "PHONETIC_EXTENSIONS_SUPPLEMENT U+1D80 U+1DBF (64)\n"
    "COMBINING_DIACRITICAL_MARKS_SUPPLEMENT U+1DC0 U+1DFF (63)\n"
    "LATIN_EXTENDED_ADDITIONAL U+1E00 U+1EFF (256)\n"
    "GREEK_EXTENDED U+1F00 U+1FFF (233)\n"
    "GENERAL_PUNCTUATION U+2000 U+206F (111)\n"
    "SUPERSCRIPTS_AND_SUBSCRIPTS U+2070 U+209F (42)\n"
    "CURRENCY_SYMBOLS U+20A0 U+20CF (32)\n"
    "COMBINING_DIACRITICAL_MARKS_FOR_SYMBOLS U+20D0 U+20FF (33)\n"
    "LETTERLIKE_SYMBOLS U+2100 U+214F (80)\n"
    "NUMBER_FORMS U+2150 U+218F (60)\n"
    "ARROWS U+2190 U+21FF (112)\n"
    "MATHEMATICAL_OPERATORS U+2200 U+22FF (256)\n"
    "MISCELLANEOUS_TECHNICAL U+2300 U+23FF (256)\n"
    "CONTROL_PICTURES U+2400 U+243F (39)\n"
    "OPTICAL_CHARACTER_RECOGNITION U+2440 U+245F (11)\n"
    "ENCLOSED_ALPHANUMERICS U+2460 U+24FF (160)\n"
    "BOX_DRAWING U+2500 U+257F (128)\n"
    "BLOCK_ELEMENTS U+2580 U+259F (32)\n"
    "GEOMETRIC_SHAPES U+25A0 U+25FF (96)\n"
    "MISCELLANEOUS_SYMBOLS U+2600 U+26FF (256)\n"
    "DINGBATS U+2700 U+27BF (192)\n"
    "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A U+27C0 U+27EF (48)\n"
    "SUPPLEMENTAL_ARROWS_A U+27F0 U+27FF (16)\n"
    "BRAILLE_PATTERNS U+2800 U+28FF (256)\n"
    "SUPPLEMENTAL_ARROWS_B U+2900 U+297F (128)\n"
    "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B U+2980 U+29FF (128)\n"
    "SUPPLEMENTAL_MATHEMATICAL_OPERATORS U+2A00 U+2AFF (256)\n"
    "MISCELLANEOUS_SYMBOLS_AND_ARROWS U+2B00 U+2BFF (207)\n"
    "GLAGOLITIC U+2C00 U+2C5F (94)\n"
    "LATIN_EXTENDED_C U+2C60 U+2C7F (32)\n"
    "COPTIC U+2C80 U+2CFF (123)\n"
    "GEORGIAN_SUPPLEMENT U+2D00 U+2D2F (40)\n"
    "TIFINAGH U+2D30 U+2D7F (59)\n"
    "ETHIOPIC_EXTENDED U+2D80 U+2DDF (79)\n"
    "CYRILLIC_EXTENDED_A U+2DE0 U+2DFF (32)\n"
    "SUPPLEMENTAL_PUNCTUATION U+2E00 U+2E7F (74)\n"
    "CJK_RADICALS_SUPPLEMENT U+2E80 U+2EFF (115)\n"
    "KANGXI_RADICALS U+2F00 U+2FDF (214)\n"
    "IDEOGRAPHIC_DESCRIPTION_CHARACTERS U+2FF0 U+2FFF (12)\n"
    "CJK_SYMBOLS_AND_PUNCTUATION U+3000 U+303F (64)\n"
    "HIRAGANA U+3040 U+309F (93)\n"
    "KATAKANA U+30A0 U+30FF (96)\n"
    "BOPOMOFO U+3100 U+312F (42)\n"
    "HANGUL_COMPATIBILITY_JAMO U+3130 U+318F (94)\n"
    "KANBUN U+3190 U+319F (16)\n"
    "BOPOMOFO_EXTENDED U+31A0 U+31BF (27)\n"
    "CJK_STROKES U+31C0 U+31EF (36)\n"
    "KATAKANA_PHONETIC_EXTENSIONS U+31F0 U+31FF (16)\n"
    "ENCLOSED_CJK_LETTERS_AND_MONTHS U+3200 U+32FF (254)\n"
    "CJK_COMPATIBILITY U+3300 U+33FF (256)\n"
    "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A U+3400 U+4DBF (6192)\n"
    "YIJING_HEXAGRAM_SYMBOLS U+4DC0 U+4DFF (64)\n"
    "CJK_UNIFIED_IDEOGRAPHS U+4E00 U+9FFF (20942)\n"
    "YI_SYLLABLES U+A000 U+A48F (1165)\n"
    "YI_RADICALS U+A490 U+A4CF (55)\n"
    "LISU U+A4D0 U+A4FF (48)\n"
    "VAI U+A500 U+A63F (300)\n"
    "CYRILLIC_EXTENDED_B U+A640 U+A69F (96)\n"
    "BAMUM U+A6A0 U+A6FF (88)\n"
    "MODIFIER_TONE_LETTERS U+A700 U+A71F (32)\n"
    "LATIN_EXTENDED_D U+A720 U+A7FF (160)\n"
    "SYLOTI_NAGRI U+A800 U+A82F (44)\n"
    "COMMON_INDIC_NUMBER_FORMS U+A830 U+A83F (10)\n"
    "PHAGS_PA U+A840 U+A87F (56)\n"
    "SAURASHTRA U+A880 U+A8DF (82)\n"
    "DEVANAGARI_EXTENDED U+A8E0 U+A8FF (30)\n"
    "KAYAH_LI U+A900 U+A92F (48)\n"
    "REJANG U+A930 U+A95F (37)\n"
    "HANGUL_JAMO_EXTENDED_A U+A960 U+A97F (29)\n"
    "JAVANESE U+A980 U+A9DF (91)\n"
    "MYANMAR_EXTENDED_B U+A9E0 U+A9FF (31)\n"
    "CHAM U+AA00 U+AA5F (83)\n"
    "MYANMAR_EXTENDED_A U+AA60 U+AA7F (32)\n"
    "TAI_VIET U+AA80 U+AADF (72)\n"
    "MEETEI_MAYEK_EXTENSIONS U+AAE0 U+AAFF (23)\n"
    "ETHIOPIC_EXTENDED_A U+AB00 U+AB2F (32)\n"
    "LATIN_EXTENDED_E U+AB30 U+AB6F (54)\n"
    "CHEROKEE_SUPPLEMENT U+AB70 U+ABBF (80)\n"
    "MEETEI_MAYEK U+ABC0 U+ABFF (56)\n"
    "HANGUL_SYLLABLES U+AC00 U+D7AF (2)\n"
    "HANGUL_JAMO_EXTENDED_B U+D7B0 U+D7FF (72)\n"
    "HIGH_SURROGATES U+D800 U+DB7F (2)\n"
    "HIGH_PRIVATE_USE_SURROGATES U+DB80 U+DBFF (2)\n"
    "LOW_SURROGATES U+DC00 U+DFFF (2)\n"
    "PRIVATE_USE_AREA U+E000 U+F8FF (2)\n"
    "CJK_COMPATIBILITY_IDEOGRAPHS U+F900 U+FAFF (472)\n"
    "ALPHABETIC_PRESENTATION_FORMS U+FB00 U+FB4F (58)\n"
    "ARABIC_PRESENTATION_FORMS_A U+FB50 U+FDFF (643)\n"
    "VARIATION_SELECTORS U+FE00 U+FE0F (16)\n"
    "VERTICAL_FORMS U+FE10 U+FE1F (10)\n"
    "COMBINING_HALF_MARKS U+FE20 U+FE2F (16)\n"
    "CJK_COMPATIBILITY_FORMS U+FE30 U+FE4F (32)\n"
    "SMALL_FORM_VARIANTS U+FE50 U+FE6F (26)\n"
    "ARABIC_PRESENTATION_FORMS_B U+FE70 U+FEFF (141)\n"
    "HALFWIDTH_AND_FULLWIDTH_FORMS U+FF00 U+FFEF (225)\n"
    "SPECIALS U+FFF0 U+FFFF (7)\n"
    "LINEAR_B_SYLLABARY U+10000 U+1007F (88)\n"
    "LINEAR_B_IDEOGRAMS U+10080 U+100FF (123)\n"
    "AEGEAN_NUMBERS U+10100 U+1013F (57)\n"
    "ANCIENT_GREEK_NUMBERS U+10140 U+1018F (79)\n"
    "ANCIENT_SYMBOLS U+10190 U+101CF (13)\n"
    "PHAISTOS_DISC U+101D0 U+101FF (46)\n"
    "LYCIAN U+10280 U+1029F (29)\n"
    "CARIAN U+102A0 U+102DF (49)\n"
    "COPTIC_EPACT_NUMBERS U+102E0 U+102FF (28)\n"
    "OLD_ITALIC U+10300 U+1032F (39)\n"
    "GOTHIC U+10330 U+1034F (27)\n"
    "OLD_PERMIC U+10350 U+1037F (43)\n"
    "UGARITIC U+10380 U+1039F (31)\n"
    "OLD_PERSIAN U+103A0 U+103DF (50)\n"
    "DESERET U+10400 U+1044F (80)\n"
    "SHAVIAN U+10450 U+1047F (48)\n"
    "OSMANYA U+10480 U+104AF (40)\n"
    "OSAGE U+104B0 U+104FF (72)\n"
    "ELBASAN U+10500 U+1052F (40)\n"
    "CAUCASIAN_ALBANIAN U+10530 U+1056F (53)\n"
    "LINEAR_A U+10600 U+1077F (341)\n"
    "CYPRIOT_SYLLABARY U+10800 U+1083F (55)\n"
    "IMPERIAL_ARAMAIC U+10840 U+1085F (31)\n"
    "PALMYRENE U+10860 U+1087F (32)\n"
    "NABATAEAN U+10880 U+108AF (40)\n"
    "HATRAN U+108E0 U+108FF (26)\n"
    "PHOENICIAN U+10900 U+1091F (29)\n"
    "LYDIAN U+10920 U+1093F (27)\n"
    "MEROITIC_HIEROGLYPHS U+10980 U+1099F (32)\n"
    "MEROITIC_CURSIVE U+109A0 U+109FF (90)\n"
    "KHAROSHTHI U+10A00 U+10A5F (65)\n"
    "OLD_SOUTH_ARABIAN U+10A60 U+10A7F (32)\n"
    "OLD_NORTH_ARABIAN U+10A80 U+10A9F (32)\n"
    "MANICHAEAN U+10AC0 U+10AFF (51)\n"
    "AVESTAN U+10B00 U+10B3F (61)\n"
    "INSCRIPTIONAL_PARTHIAN U+10B40 U+10B5F (30)\n"
    "INSCRIPTIONAL_PAHLAVI U+10B60 U+10B7F (27)\n"
    "PSALTER_PAHLAVI U+10B80 U+10BAF (29)\n"
    "OLD_TURKIC U+10C00 U+10C4F (73)\n"
    "OLD_HUNGARIAN U+10C80 U+10CFF (108)\n"
    "RUMI_NUMERAL_SYMBOLS U+10E60 U+10E7F (31)\n"
    "BRAHMI U+11000 U+1107F (109)\n"
    "KAITHI U+11080 U+110CF (66)\n"
    "SORA_SOMPENG U+110D0 U+110FF (35)\n"
    "CHAKMA U+11100 U+1114F (67)\n"
    "MAHAJANI U+11150 U+1117F (39)\n"
    "SHARADA U+11180 U+111DF (94)\n"
    "SINHALA_ARCHAIC_NUMBERS U+111E0 U+111FF (20)\n"
    "KHOJKI U+11200 U+1124F (62)\n"
    "MULTANI U+11280 U+112AF (38)\n"
    "KHUDAWADI U+112B0 U+112FF (69)\n"
    "GRANTHA U+11300 U+1137F (85)\n"
    "NEWA U+11400 U+1147F (92)\n"
    "TIRHUTA U+11480 U+114DF (82)\n"
    "SIDDHAM U+11580 U+115FF (92)\n"
    "MODI U+11600 U+1165F (79)\n"
    "MONGOLIAN_SUPPLEMENT U+11660 U+1167F (13)\n"
    "TAKRI U+11680 U+116CF (66)\n"
    "AHOM U+11700 U+1173F (57)\n"
    "WARANG_CITI U+118A0 U+118FF (84)\n"
    "ZANABAZAR_SQUARE U+11A00 U+11A4F (72)\n"
    "SOYOMBO U+11A50 U+11AAF (80)\n"
    "PAU_CIN_HAU U+11AC0 U+11AFF (57)\n"
    "BHAIKSUKI U+11C00 U+11C6F (97)\n"
    "MARCHEN U+11C70 U+11CBF (68)\n"
    "MASARAM_GONDI U+11D00 U+11D5F (75)\n"
    "CUNEIFORM U+12000 U+123FF (922)\n"
    "CUNEIFORM_NUMBERS_AND_PUNCTUATION U+12400 U+1247F (116)\n"
    "EARLY_DYNASTIC_CUNEIFORM U+12480 U+1254F (196)\n"
    "EGYPTIAN_HIEROGLYPHS U+13000 U+1342F (1071)\n"
    "ANATOLIAN_HIEROGLYPHS U+14400 U+1467F (583)\n"
    "BAMUM_SUPPLEMENT U+16800 U+16A3F (569)\n"
    "MRO U+16A40 U+16A6F (43)\n"
    "BASSA_VAH U+16AD0 U+16AFF (36)\n"
    "PAHAWH_HMONG U+16B00 U+16B8F (127)\n"
    "MIAO U+16F00 U+16F9F (133)\n"
    "IDEOGRAPHIC_SYMBOLS_AND_PUNCTUATION U+16FE0 U+16FFF (2)\n"
    "TANGUT U+17000 U+187FF (2)\n"
    "TANGUT_COMPONENTS U+18800 U+18AFF (755)\n"
    "KANA_SUPPLEMENT U+1B000 U+1B0FF (256)\n"
    "KANA_EXTENDED_A U+1B100 U+1B12F (31)\n"
    "NUSHU U+1B170 U+1B2FF (396)\n"
    "DUPLOYAN U+1BC00 U+1BC9F (143)\n"
    "SHORTHAND_FORMAT_CONTROLS U+1BCA0 U+1BCAF (4)\n"
    "BYZANTINE_MUSICAL_SYMBOLS U+1D000 U+1D0FF (246)\n"
    "MUSICAL_SYMBOLS U+1D100 U+1D1FF (231)\n"
    "ANCIENT_GREEK_MUSICAL_NOTATION U+1D200 U+1D24F (70)\n"
    "TAI_XUAN_JING_SYMBOLS U+1D300 U+1D35F (87)\n"
    "COUNTING_ROD_NUMERALS U+1D360 U+1D37F (18)\n"
    "MATHEMATICAL_ALPHANUMERIC_SYMBOLS U+1D400 U+1D7FF (996)\n"
    "SUTTON_SIGNWRITING U+1D800 U+1DAAF (672)\n"
    "GLAGOLITIC_SUPPLEMENT U+1E000 U+1E02F (38)\n"
    "MENDE_KIKAKUI U+1E800 U+1E8DF (213)\n"
    "ADLAM U+1E900 U+1E95F (87)\n"
    "ARABIC_MATHEMATICAL_ALPHABETIC_SYMBOLS U+1EE00 U+1EEFF (143)\n"
    "MAHJONG_TILES U+1F000 U+1F02F (44)\n"
    "DOMINO_TILES U+1F030 U+1F09F (100)\n"
    "PLAYING_CARDS U+1F0A0 U+1F0FF (82)\n"
    "ENCLOSED_ALPHANUMERIC_SUPPLEMENT U+1F100 U+1F1FF (191)\n"
    "ENCLOSED_IDEOGRAPHIC_SUPPLEMENT U+1F200 U+1F2FF (64)\n"
    "MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS U+1F300 U+1F5FF (768)\n"
    "EMOTICONS U+1F600 U+1F64F (80)\n"
    "ORNAMENTAL_DINGBATS U+1F650 U+1F67F (48)\n"
    "TRANSPORT_AND_MAP_SYMBOLS U+1F680 U+1F6FF (107)\n"
    "ALCHEMICAL_SYMBOLS U+1F700 U+1F77F (116)\n"
    "GEOMETRIC_SHAPES_EXTENDED U+1F780 U+1F7FF (85)\n"
    "SUPPLEMENTAL_ARROWS_C U+1F800 U+1F8FF (148)\n"
    "SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS U+1F900 U+1F9FF (148)\n"
    "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B U+20000 U+2A6DF (42676)\n"
    "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C U+2A700 U+2B73F (60)\n"
    "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D U+2B740 U+2B81F (27)\n"
    "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E U+2B820 U+2CEAF (6)\n"
    "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F U+2CEB0 U+2EBEF (2)\n"
    "CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT U+2F800 U+2FA1F (542)\n"
    "TAGS U+E0000 U+E007F (97)\n"
    "VARIATION_SELECTORS_SUPPLEMENT U+E0100 U+E01EF (240)\n"
    "SUPPLEMENTARY_PRIVATE_USE_AREA_A U+F0000 U+FFFFF (4)\n"
    "SUPPLEMENTARY_PRIVATE_USE_AREA_B U+100000 U+10FFFF (4)\n"
)

# Four whitespace-separated columns.  Matching on runs of whitespace keeps the
# parser working whether the table is tab- or space-delimited.
_ROW_PATTERN = re.compile(r"(\S+)\s+(\S+)\s+(\S+)\s+(\S+)")


def _parse_blocks(rows):
    """Return ``(names, firsts, lasts)`` parsed from table rows.

    Rows that do not have four columns (such as the empty string left after
    the final newline) are skipped.  ``U+XXXX`` code points are rewritten as
    C hexadecimal literals (``0xXXXX``).
    """
    names, firsts, lasts = [], [], []
    for row in rows:
        found = _ROW_PATTERN.match(row)
        if not found:
            continue
        names.append(found.group(1))
        firsts.append(found.group(2).replace("U+", "0x"))
        lasts.append(found.group(3).replace("U+", "0x"))
    return names, firsts, lasts


def _generate_source(names, firsts, lasts):
    """Return the generated C++ source as a list of lines (no newlines)."""
    out = ["#define CTYPE_UNKNOWN 0"]
    # Ids start at 1 because 0 is reserved for CTYPE_UNKNOWN.
    for index, name in enumerate(names, start=1):
        out.append("#define CTYPE_{} {}".format(name, index))

    out.append("unsigned int get_type(wchar_t c){")
    for name, first, last in zip(names, firsts, lasts):
        out.append(" if({} <= c && c <= {}){{".format(first, last))
        out.append(" return CTYPE_{};".format(name))
        out.append(" }")
    out.append(" return CTYPE_UNKNOWN;")
    out.append("}")

    out.append("std::string get_name(unsigned int type){")
    for index, name in enumerate(names, start=1):
        out.append(" if(type == {}){{".format(index))
        out.append(" return \"{}\";".format(name.replace("_", " ").lower().title()))
        out.append(" }")
    out.append(" return \"Unknown\";")
    # Close get_name(); without this brace the emitted C++ does not compile.
    out.append("}")
    return out


lines = text.split("\n")
ctypes, starts, ends = _parse_blocks(lines)

# Kept as a module-level list so the emitted text can be inspected as well as
# printed.
output_lines = _generate_source(ctypes, starts, ends)
print("\n".join(output_lines))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment