Last active
November 7, 2015 21:49
-
-
Save Teino1978-Corp/139c1bc26a04ffbe0c1a to your computer and use it in GitHub Desktop.
Pinyin Split: covers all Chinese syllables in Unicode 6.1 (note: there are no uniform phonetic rules, so some unusual combinations, such as "ng", will appear)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/***************************************************************************
 * Pinyin.php
 * ------------------------------
 * Date      : Nov 7, 2006
 * Copyright : Code adapted from the Internet; copyright belongs to the original author
 * Mail      :
 * Desc.     : Pinyin conversion
 * History   :
 * Date      :
 * Author    :
 * Modif.    :
 * Usage Example :
 ***************************************************************************/
/**
 * Convert a Chinese string into toneless pinyin.
 *
 * The input is scanned byte by byte. A byte greater than 160 is combined with
 * the following byte into a signed 16-bit GB2312 code, which _Pinyin() maps to
 * a syllable through the threshold table built below. Other bytes are handed
 * to _Pinyin() unchanged. Finally everything except lowercase letters and
 * digits is removed, so spaces, punctuation and uppercase letters of the
 * input do not appear in the result.
 *
 * @param string $_String Text to convert.
 * @param string $_Code   'gb2312' when $_String is already GB2312-encoded;
 *                        any other value means the text is UTF-8 and is
 *                        converted with _U2_Utf8_Gb() first.
 * @return string Concatenated pinyin made of [a-z0-9] only.
 */
function Pinyin($_String, $_Code = 'gb2312')
{
    // The lookup table never changes, so build and sort it only once.
    static $_Data = null;

    if ($_Data === null) {
        // Syllables and the GB2312 code at which each syllable starts;
        // both lists are '|'-separated and must stay in the same order.
        $_DataKey = "a|ai|an|ang|ao|ba|bai|ban|bang|bao|bei|ben|beng|bi|bian|biao|bie|bin|bing|bo|bu|ca|cai|can|cang|cao|ce|ceng|cha".
            "|chai|chan|chang|chao|che|chen|cheng|chi|chong|chou|chu|chuai|chuan|chuang|chui|chun|chuo|ci|cong|cou|cu|".
            "cuan|cui|cun|cuo|da|dai|dan|dang|dao|de|deng|di|dian|diao|die|ding|diu|dong|dou|du|duan|dui|dun|duo|e|en|er".
            "|fa|fan|fang|fei|fen|feng|fo|fou|fu|ga|gai|gan|gang|gao|ge|gei|gen|geng|gong|gou|gu|gua|guai|guan|guang|gui".
            "|gun|guo|ha|hai|han|hang|hao|he|hei|hen|heng|hong|hou|hu|hua|huai|huan|huang|hui|hun|huo|ji|jia|jian|jiang".
            "|jiao|jie|jin|jing|jiong|jiu|ju|juan|jue|jun|ka|kai|kan|kang|kao|ke|ken|keng|kong|kou|ku|kua|kuai|kuan|kuang".
            "|kui|kun|kuo|la|lai|lan|lang|lao|le|lei|leng|li|lia|lian|liang|liao|lie|lin|ling|liu|long|lou|lu|lv|luan|lue".
            "|lun|luo|ma|mai|man|mang|mao|me|mei|men|meng|mi|mian|miao|mie|min|ming|miu|mo|mou|mu|na|nai|nan|nang|nao|ne".
            "|nei|nen|neng|ni|nian|niang|niao|nie|nin|ning|niu|nong|nu|nv|nuan|nue|nuo|o|ou|pa|pai|pan|pang|pao|pei|pen".
            "|peng|pi|pian|piao|pie|pin|ping|po|pu|qi|qia|qian|qiang|qiao|qie|qin|qing|qiong|qiu|qu|quan|que|qun|ran|rang".
            "|rao|re|ren|reng|ri|rong|rou|ru|ruan|rui|run|ruo|sa|sai|san|sang|sao|se|sen|seng|sha|shai|shan|shang|shao|".
            "she|shen|sheng|shi|shou|shu|shua|shuai|shuan|shuang|shui|shun|shuo|si|song|sou|su|suan|sui|sun|suo|ta|tai|".
            "tan|tang|tao|te|teng|ti|tian|tiao|tie|ting|tong|tou|tu|tuan|tui|tun|tuo|wa|wai|wan|wang|wei|wen|weng|wo|wu".
            "|xi|xia|xian|xiang|xiao|xie|xin|xing|xiong|xiu|xu|xuan|xue|xun|ya|yan|yang|yao|ye|yi|yin|ying|yo|yong|you".
            "|yu|yuan|yue|yun|za|zai|zan|zang|zao|ze|zei|zen|zeng|zha|zhai|zhan|zhang|zhao|zhe|zhen|zheng|zhi|zhong|".
            "zhou|zhu|zhua|zhuai|zhuan|zhuang|zhui|zhun|zhuo|zi|zong|zou|zu|zuan|zui|zun|zuo";
        $_DataValue = "-20319|-20317|-20304|-20295|-20292|-20283|-20265|-20257|-20242|-20230|-20051|-20036|-20032|-20026|-20002|-19990".
            "|-19986|-19982|-19976|-19805|-19784|-19775|-19774|-19763|-19756|-19751|-19746|-19741|-19739|-19728|-19725".
            "|-19715|-19540|-19531|-19525|-19515|-19500|-19484|-19479|-19467|-19289|-19288|-19281|-19275|-19270|-19263".
            "|-19261|-19249|-19243|-19242|-19238|-19235|-19227|-19224|-19218|-19212|-19038|-19023|-19018|-19006|-19003".
            "|-18996|-18977|-18961|-18952|-18783|-18774|-18773|-18763|-18756|-18741|-18735|-18731|-18722|-18710|-18697".
            "|-18696|-18526|-18518|-18501|-18490|-18478|-18463|-18448|-18447|-18446|-18239|-18237|-18231|-18220|-18211".
            "|-18201|-18184|-18183|-18181|-18012|-17997|-17988|-17970|-17964|-17961|-17950|-17947|-17931|-17928|-17922".
            "|-17759|-17752|-17733|-17730|-17721|-17703|-17701|-17697|-17692|-17683|-17676|-17496|-17487|-17482|-17468".
            "|-17454|-17433|-17427|-17417|-17202|-17185|-16983|-16970|-16942|-16915|-16733|-16708|-16706|-16689|-16664".
            "|-16657|-16647|-16474|-16470|-16465|-16459|-16452|-16448|-16433|-16429|-16427|-16423|-16419|-16412|-16407".
            "|-16403|-16401|-16393|-16220|-16216|-16212|-16205|-16202|-16187|-16180|-16171|-16169|-16158|-16155|-15959".
            "|-15958|-15944|-15933|-15920|-15915|-15903|-15889|-15878|-15707|-15701|-15681|-15667|-15661|-15659|-15652".
            "|-15640|-15631|-15625|-15454|-15448|-15436|-15435|-15419|-15416|-15408|-15394|-15385|-15377|-15375|-15369".
            "|-15363|-15362|-15183|-15180|-15165|-15158|-15153|-15150|-15149|-15144|-15143|-15141|-15140|-15139|-15128".
            "|-15121|-15119|-15117|-15110|-15109|-14941|-14937|-14933|-14930|-14929|-14928|-14926|-14922|-14921|-14914".
            "|-14908|-14902|-14894|-14889|-14882|-14873|-14871|-14857|-14678|-14674|-14670|-14668|-14663|-14654|-14645".
            "|-14630|-14594|-14429|-14407|-14399|-14384|-14379|-14368|-14355|-14353|-14345|-14170|-14159|-14151|-14149".
            "|-14145|-14140|-14137|-14135|-14125|-14123|-14122|-14112|-14109|-14099|-14097|-14094|-14092|-14090|-14087".
            "|-14083|-13917|-13914|-13910|-13907|-13906|-13905|-13896|-13894|-13878|-13870|-13859|-13847|-13831|-13658".
            "|-13611|-13601|-13406|-13404|-13400|-13398|-13395|-13391|-13387|-13383|-13367|-13359|-13356|-13343|-13340".
            "|-13329|-13326|-13318|-13147|-13138|-13120|-13107|-13096|-13095|-13091|-13076|-13068|-13063|-13060|-12888".
            "|-12875|-12871|-12860|-12858|-12852|-12849|-12838|-12831|-12829|-12812|-12802|-12607|-12597|-12594|-12585".
            "|-12556|-12359|-12346|-12320|-12300|-12120|-12099|-12089|-12074|-12067|-12058|-12039|-11867|-11861|-11847".
            "|-11831|-11798|-11781|-11604|-11589|-11536|-11358|-11340|-11339|-11324|-11303|-11097|-11077|-11067|-11055".
            "|-11052|-11045|-11041|-11038|-11024|-11020|-11019|-11018|-11014|-10838|-10832|-10815|-10800|-10790|-10780".
            "|-10764|-10587|-10544|-10533|-10519|-10331|-10329|-10328|-10322|-10315|-10309|-10307|-10296|-10281|-10274".
            "|-10270|-10262|-10260|-10256|-10254";

        $_TDataKey   = explode('|', $_DataKey);
        $_TDataValue = explode('|', $_DataValue);

        // array_combine() is missing on PHP 4; fall back to the local helper there.
        $_Data = function_exists('array_combine')
            ? array_combine($_TDataKey, $_TDataValue)
            : _Array_Combine($_TDataKey, $_TDataValue);

        // Highest code first, so _Pinyin() can stop at the first threshold <= code.
        arsort($_Data);
    }

    // Compare (not assign): anything other than 'gb2312' is treated as UTF-8.
    if ((string) $_Code !== 'gb2312') {
        $_String = _U2_Utf8_Gb($_String);
    }
    // iconv() may have returned false; treat that as empty input.
    $_String = (string) $_String;

    $_Res = '';
    $_Len = strlen($_String);
    for ($i = 0; $i < $_Len; $i++) {
        $_P = ord(substr($_String, $i, 1));
        if ($_P > 160) {
            // Lead byte of a double-byte character: consume the trail byte and
            // form the signed 16-bit code used by the table (hence - 65536).
            $_Q = ord(substr($_String, ++$i, 1));
            $_P = $_P * 256 + $_Q - 65536;
        }
        $_Res .= _Pinyin($_P, $_Data);
    }

    // Keep lowercase letters and digits only.
    return preg_replace("/[^a-z0-9]*/", '', $_Res);
}
/**
 * Map one character code to its output text.
 *
 * @param int   $_Num  A single-byte value, or a signed 16-bit double-byte
 *                     code as computed by Pinyin().
 * @param array $_Data Syllable => starting code, ordered from the highest
 *                     code to the lowest.
 * @return string The character itself for 0 < $_Num < 160, the syllable whose
 *                starting code is the first one not greater than $_Num for
 *                codes in [-20319, -10247], otherwise ''.
 */
function _Pinyin($_Num, $_Data)
{
    // Single-byte characters are passed through unchanged.
    if ($_Num > 0 && $_Num < 160) {
        return chr($_Num);
    }

    // Codes outside the range covered by the table have no pinyin.
    if ($_Num < -20319 || $_Num > -10247) {
        return '';
    }

    // $_Data is sorted descending, so the first threshold <= $_Num wins.
    foreach ($_Data as $k => $v) {
        if ($v <= $_Num) {
            return $k;
        }
    }

    // Empty table or no threshold matched: nothing to return.
    return '';
}
/**
 * Convert UTF-8 text, or a single Unicode code point, to GB2312 bytes.
 *
 * @param string|int $_C A UTF-8 encoded string (this is what Pinyin()
 *                       passes), or an integer Unicode code point.
 * @return string|false The GB2312-encoded bytes, or false when iconv()
 *                      cannot perform the conversion.
 */
function _U2_Utf8_Gb($_C)
{
    // A string is already UTF-8 text: convert it as a whole. Doing code-point
    // arithmetic on it would be meaningless (and a TypeError on PHP 8).
    if (!is_int($_C)) {
        return iconv('UTF-8', 'GB2312', (string) $_C);
    }

    // Integer: encode the code point as a UTF-8 byte sequence first.
    $_String = '';
    if ($_C < 0x80) {
        $_String .= chr($_C);
    } elseif ($_C < 0x800) {
        $_String .= chr(0xC0 | ($_C >> 6));
        $_String .= chr(0x80 | ($_C & 0x3F));
    } elseif ($_C < 0x10000) {
        $_String .= chr(0xE0 | ($_C >> 12));
        $_String .= chr(0x80 | (($_C >> 6) & 0x3F));
        $_String .= chr(0x80 | ($_C & 0x3F));
    } elseif ($_C < 0x200000) {
        $_String .= chr(0xF0 | ($_C >> 18));
        $_String .= chr(0x80 | (($_C >> 12) & 0x3F));
        $_String .= chr(0x80 | (($_C >> 6) & 0x3F));
        $_String .= chr(0x80 | ($_C & 0x3F));
    }

    return iconv('UTF-8', 'GB2312', $_String);
}
/**
 * Fallback for array_combine() on PHP versions that lack it.
 *
 * @param array $_Arr1 Zero-indexed list of keys.
 * @param array $_Arr2 Zero-indexed list of values, same order as $_Arr1.
 * @return array Map of $_Arr1[$i] => $_Arr2[$i]; empty array for empty input.
 */
function _Array_Combine($_Arr1, $_Arr2)
{
    // Initialise the result so empty input yields array() rather than an
    // undefined variable.
    $_Res = array();
    $_Count = count($_Arr1);
    for ($i = 0; $i < $_Count; $i++) {
        $_Res[$_Arr1[$i]] = $_Arr2[$i];
    }
    return $_Res;
}
// Usage examples: print the conversion result for two sample strings.
echo Pinyin('This is a small ultra-site, visit http://www.eb163.com'); // default encoding: GB2312
echo Pinyin('This is the WEB programming network', 1); // any second argument other than 'gb2312' means the input is UTF-8
?>
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#vim: encoding=utf-8
"""
Pinyin word segmentation.
"""
__author__ = "dreampuf<[email protected]>"

import unittest
try:
    import cPickle as pickle  # Python 2: C implementation of pickle
except ImportError:
    import pickle  # Python 3: cPickle no longer exists as a separate module
from collections import defaultdict
from pprint import pprint


def tree():
    """Return a defaultdict whose missing keys are created as nested trees."""
    return defaultdict(tree)
# Inventory of pinyin syllables recognised by the splitter. Besides ordinary
# toneless syllables it contains a few entries with diacritics or u-umlaut
# (written as escapes) and consonant-only interjections such as 'ng' and 'hm'.
py = set([
    u'gu', u'qiao', u'qian', u'ge', u'gang', u'ga', u'lian', u'liao', u'rou', u'zong',
    u'tu', u'seng', u'ti', u'te', u'ta', u'nong', u'zhang', u'fan', u'tuan', u'gua',
    u'die', u'gui', u'guo', u'gun', u'sang', u'diu', u'tei', u'zi', u'ze', u'za',
    u'chen', u'zu', u'ruo', u'dian', u'diao', u'nei', u'suo', u'sun', u'zhao', u'sui',
    u'kuo', u'kun', u'kui', u'zhai', u'zuan', u'kua', u'bo', u'ning', u'lei', u'neng',
    u'men', u'mei', u'geng', u'chang', u'shua', u'cha', u'che', u'fen', u'chi', u'fei',
    u'chu', u'shui', u'me', u'ma', u'mo', u'mi', u'mu', u'dei', u'cai', u'zhan',
    u'cao', u'can', u'den', u'wang', u'beng', u'zhuang', u'tan', u'tao', u'tai', u'eng',
    u'song', u'ping', u'hou', u'cuan', u'\u0148g', u'lan', u'lao', u'fu', u'fa', u'jiong',
    u'mai', u'xiang', u'mao', u'fo', u'a', u'jiang', u'kuang', u'bing', u'su', u'si',
    u'sa', u'se', u'zan', u'm\u0300', u'xuan', u'zei', u'zen', u'kong', u'pang', u'le',
    u'jia', u'jin', u'lo', u'lai', u'li', u'peng', u'lu', u'yi', u'yo', u'ya',
    u'cen', u'dan', u'dao', u'ye', u'din', u'cei', u'zhen', u'jiu', u'bang', u'nou',
    u'yu', u'weng', u'wong', u'en', u'ei', u'kang', u'dia', u'er', u'ru', u'keng',
    u're', u'ren', u'gou', u'ri', u'she', u'tian', u'tiao', u'que', u'shi', u'shun',
    u'shuo', u'qun', u'xue', u'yun', u'xun', u'fiao', u'yue', u'ding', u'zao', u'rang',
    u'xi', u'yong', u'zai', u'guan', u'guai', u'dong', u'kuai', u'ying', u'kuan', u'xu',
    u'xia', u'xie', u'yin', u'rong', u'xin', u'tou', u'nian', u'niao', u'xiu', u'man',
    u'kou', u'niang', u'hua', u'chao', u'hun', u'huo', u'hui', u'shuan', u'quan', u'shuai',
    u'chong', u'bei', u'ben', u'dang', u'sai', u'ang', u'sao', u'san', u'reng', u'ran',
    u'rao', u'ming', u'l\u01dc', u'l\u01da', u'l\u01d8', u'lie', u'lia', u'min', u'miao', u'mian',
    u'mie', u'liu', u'zou', u'miu', u'nen', u'kai', u'kao', u'kan', u'dai', u'ka',
    u'ke', u'yang', u'ku', u'deng', u'dou', u'shou', u'chuang', u'nang', u'feng', u'meng',
    u'cheng', u'di', u'de', u'da', u'gei', u'du', u'gen', u'qu', u'shu', u'sha',
    u'\u1e3f', u'ban', u'bao', u'bai', u'nun', u'nuo', u'sen', u'kei', u'fang', u'teng',
    u'lun', u'luo', u'ken', u'wa', u'wo', u'ju', u'tui', u'wu', u'jie', u'ji',
    u'huang', u'tuo', u'cou', u'la', u'mang', u'ci', u'tun', u'tong', u'ca', u'pou',
    u'ce', u'gong', u'cu', u'dui', u'dun', u'duo', u'ting', u'qie', u'yao', u'yan',
    u'pi', u'po', u'suan', u'chua', u'chun', u'\u0148', u'chui', u'gao', u'gan', u'ao',
    u'gai', u'xiong', u'tang', u'n', u'pian', u'piao', u'cang', u'heng', u'xian', u'xiao',
    u'bian', u'biao', u'zhua', u'duan', u'cong', u'zhui', u'zhuo', u'zhun', u'hong', u'shuang',
    u'juan', u'zhei', u'pai', u'shai', u'shan', u'shao', u'pan', u'pao', u'nin', u'nia',
    u'hang', u'\u01f9g', u'nie', u'zhuai', u'mou', u'zhuan', u'yuan', u'niu', u'zhong', u'qi',
    u'lin', u'guang', u'nao', u'n\u01d8', u'n\u01da', u'n\u01dc', u'hai', u'han', u'hao', u'wei',
    u'wen', u'ruan', u'cuo', u'cun', u'cui', u'bin', u'bie', u'l\xfce', u'shen', u'shei',
    u'fou', u'xing', u'\u0144g', u'qia', u'qiang', u'nuan', u'pen', u'pei', u'\u01f9', u'rui',
    u'run', u'ba', u'sheng', u'rua', u'bi', u'bu', u'chuan', u'qing', u'chuai', u'pu',
    u'o', u'chou', u'ou', u'zui', u'luan', u'zuo', u'jian', u'jiao', u'sou', u'wan',
    u'jing', u'qiong', u'wai', u'long', u'pa', u'liang', u'lou', u'huan', u'hen', u'hei',
    u'huai', u'n\xfce', u'\u0144', u'jue', u'shang', u'jun', u'hu', u'hm', u'ling', u'ha',
    u'he', u'zhu', u'ceng', u'zha', u'zhe', u'zhi', u'qin', u'pin', u'ai', u'chai',
    u'chan', u'pie', u'zeng', u'an', u'qiu', u'ni', u'na', u'zang', u'nai', u'nan',
    u'ne', u'ng', u'chuo', u'tie', u'you', u'nu', u'zheng', u'leng', u'zun', u'zhou',
    u'lang', u'e', u'hng',
])
def pinyin_sub_gen(nodes, s, v):
    """Yield every segmentation encoded in a tree of cut positions.

    Args:
        nodes: mapping of end position -> child mapping of the same shape;
            each key is an index into ``v`` where a piece starting at ``s``
            ends.
        s: index in ``v`` where the next piece starts.
        v: the string being segmented.

    Yields:
        Lists of slices of ``v``. Every path through the tree is produced,
        as is each of its prefixes, and finally the empty list.
    """
    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    for p, sub_node in nodes.items():
        for sub in pinyin_sub_gen(sub_node, p, v):
            if s == p:
                # A key equal to the start position yields an empty slice
                # (e.g. the root entry at 0), so it adds no piece of its own.
                yield sub
            else:
                yield [v[s:p]] + sub
    yield []
def pinyin_split(pinyin, syllables=None):
    """Split an unspaced pinyin string into syllables.

    Of all ways to cut ``pinyin`` into known syllables, the one with the most
    syllables is returned. A segmentation that covers the whole string is
    always preferred; only when none exists is the best segmentation of a
    prefix returned (the unmatched tail is then omitted). Ties are resolved
    deterministically in favour of the longer trailing syllable.

    Args:
        pinyin: the string to split, e.g. ``"woshi"``.
        syllables: optional iterable of valid syllables; defaults to the
            module-level ``py`` inventory.

    Returns:
        A list of substrings of ``pinyin`` in order; ``[]`` when not even the
        first syllable can be matched or the input is empty.
    """
    if syllables is None:
        syllables = py
    known = frozenset(syllables)
    total = len(pinyin)

    # Distinct non-empty syllable lengths, ascending, so the inner loop can
    # stop as soon as a candidate would run past the end of the string.
    sizes = sorted(set(len(item) for item in known if item))

    # count[i]: largest number of syllables that exactly tile pinyin[:i],
    # or None while position i is unreachable. prev[i]: start of the last
    # syllable on that best tiling.
    count = [None] * (total + 1)
    prev = [None] * (total + 1)
    count[0] = 0

    for start in range(total):
        if count[start] is None:
            continue
        for size in sizes:
            end = start + size
            if end > total:
                break
            if pinyin[start:end] not in known:
                continue
            # Strict '>' keeps the earlier (longer-syllable) choice on ties.
            if count[end] is None or count[start] + 1 > count[end]:
                count[end] = count[start] + 1
                prev[end] = start

    if count[total] is not None:
        end = total
    else:
        # No full cover: fall back to the reachable position with the most
        # syllables, preferring the one that covers more of the input.
        end = max((i for i in range(total + 1) if count[i] is not None),
                  key=lambda i: (count[i], i))

    # Walk the back-pointers to rebuild the chosen segmentation.
    pieces = []
    while end > 0:
        start = prev[end]
        pieces.append(pinyin[start:end])
        end = start
    pieces.reverse()
    return pieces
class TestCase(unittest.TestCase):
    """Checks for pinyin_split against the module-level syllable inventory."""

    def test_pinyin_dp(self):
        # This input has exactly one valid segmentation.
        v = "woshidewenfensi"
        self.assertEqual(pinyin_split(v),
                         ['wo', 'shi', 'de', 'wen', 'fen', 'si'])

        # Several equally long segmentations exist here, so only check that
        # the result covers the input and uses known syllables.
        v = "woaibeijinganmen"
        result = pinyin_split(v)
        self.assertEqual("".join(result), v)
        for syllable in result:
            self.assertIn(syllable, py)


if __name__ == "__main__":
    unittest.main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| woshidewenfensi -> ['wo', 'shi', 'de', 'wen', 'fen', 'si'] | |
| woaibeijinanmen -> ['wo', 'ai', 'bei', 'ji', 'n', 'a', 'n', 'me', 'n'] | |
| # All possible combinations | |
| [['Wo', 'ai', 'bei', 'ji', 'n', 'a', 'n', 'me', 'n'], | |
| ['Wo', 'ai', 'bei', 'ji', 'n', 'a', 'n', 'me'], | |
| ['Wo', 'ai', 'bei', 'ji', 'n', 'a', 'n', 'men'], | |
| ['Wo', 'ai', 'bei', 'ji', 'n', 'an', 'me', 'n'], | |
| ['Wo', 'ai', 'bei', 'ji', 'na', 'n', 'me', 'n'], | |
| ['Wo', 'ai', 'bei', 'jin', 'a', 'n', 'me', 'n'], | |
| ['Wo', 'ai', 'bei', 'ji', 'n', 'a', 'n'], | |
| ['Wo', 'ai', 'bei', 'ji', 'n', 'an', 'me'], | |
| ['Wo', 'ai', 'bei', 'ji', 'n', 'an', 'men'], | |
| ['Wo', 'ai', 'bei', 'ji', 'na', 'n', 'me'], | |
| ['Wo', 'ai', 'bei', 'ji', 'na', 'n', 'men'], | |
| ['Wo', 'ai', 'bei', 'ji', 'nan', 'me', 'n'], | |
| ['Wo', 'ai', 'bei', 'jin', 'a', 'n', 'me'], | |
| ['Wo', 'ai', 'bei', 'jin', 'a', 'n', 'men'], | |
| ['Wo', 'ai', 'bei', 'jin', 'an', 'me', 'n'], | |
| ['Wo', 'ai', 'bei', 'ji', 'n', 'a'], | |
| ['Wo', 'ai', 'bei', 'ji', 'n', 'an'], | |
| ['Wo', 'ai', 'bei', 'ji', 'na', 'n'], | |
| ['Wo', 'ai', 'bei', 'ji', 'nan', 'me'], | |
| ['Wo', 'ai', 'bei', 'ji', 'nan', 'men'], | |
| ['Wo', 'ai', 'bei', 'jin', 'a', 'n'], | |
| ['Wo', 'ai', 'bei', 'jin', 'an', 'me'], | |
| ['Wo', 'ai', 'bei', 'jin', 'an', 'men'], | |
| ['Wo', 'ai', 'bei', 'ji', 'n'], | |
| ['Wo', 'ai', 'bei', 'ji', 'na'], | |
| ['Wo', 'ai', 'bei', 'ji', 'nan'], | |
| ['Wo', 'ai', 'bei', 'jin', 'a'], | |
| ['Wo', 'ai', 'bei', 'jin', 'an'], | |
| ['Wo', 'ai', 'bei', 'ji'], | |
| ['Wo', 'ai', 'bei', 'jin'], | |
| ['Wo', 'ai', 'bei'], | |
| ['Wo', 'a'], | |
| ['Wo', 'ai'], | |
| ['Wo'], | |
| [], | |
| []] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment