Skip to content

Instantly share code, notes, and snippets.

@Teino1978-Corp
Last active November 7, 2015 21:49
Show Gist options
  • Select an option

  • Save Teino1978-Corp/139c1bc26a04ffbe0c1a to your computer and use it in GitHub Desktop.

Select an option

Save Teino1978-Corp/139c1bc26a04ffbe0c1a to your computer and use it in GitHub Desktop.
Pinyin Split: all Chinese syllables from Unicode 6.1 (yes, there are no uniform phonetic rules, so there will be some unusual combinations, such as "ng")
<?php
/***************************************************************************
 * Pinyin.php
 * ------------------------------
 * Date: Nov 7, 2006
 * Copyright: Code adapted from sources found online; copyright belongs to the original author
 * Mail:
 * Desc: Pinyin conversion
 * History:
 * Date:
 * Author:
 * Modif.:
 * Usage Example:
 ***************************************************************************/
/**
 * Convert a string to toneless pinyin, one syllable per GB2312 character.
 *
 * The two pipe-separated tables below are combined into a map of
 * syllable => smallest signed GB2312 code that starts that syllable. The map
 * is sorted by code in descending order so that _Pinyin() can stop at the
 * first entry whose code is not greater than the character code.
 *
 * @param string $_String Text to convert.
 * @param mixed  $_Code   Input encoding marker. Anything other than
 *                        'gb2312' sends the input through _U2_Utf8_Gb()
 *                        first.
 * @return string The concatenated syllables. Every character outside
 *                a-z and 0-9 (including uppercase ASCII) is removed.
 */
function Pinyin($_String, $_Code = 'gb2312')
{
    $_DataKey = "a|ai|an|ang|ao|ba|bai|ban|bang|bao|bei|ben|beng|bi|bian|biao|bie|bin|bing|bo|bu|ca|cai|can|cang|cao|ce|ceng|cha" .
        "|chai|chan|chang|chao|che|chen|cheng|chi|chong|chou|chu|chuai|chuan|chuang|chui|chun|chuo|ci|cong|cou|cu|" .
        "cuan|cui|cun|cuo|da|dai|dan|dang|dao|de|deng|di|dian|diao|die|ding|diu|dong|dou|du|duan|dui|dun|duo|e|en|er" .
        "|fa|fan|fang|fei|fen|feng|fo|fou|fu|ga|gai|gan|gang|gao|ge|gei|gen|geng|gong|gou|gu|gua|guai|guan|guang|gui" .
        "|gun|guo|ha|hai|han|hang|hao|he|hei|hen|heng|hong|hou|hu|hua|huai|huan|huang|hui|hun|huo|ji|jia|jian|jiang" .
        "|jiao|jie|jin|jing|jiong|jiu|ju|juan|jue|jun|ka|kai|kan|kang|kao|ke|ken|keng|kong|kou|ku|kua|kuai|kuan|kuang" .
        "|kui|kun|kuo|la|lai|lan|lang|lao|le|lei|leng|li|lia|lian|liang|liao|lie|lin|ling|liu|long|lou|lu|lv|luan|lue" .
        "|lun|luo|ma|mai|man|mang|mao|me|mei|men|meng|mi|mian|miao|mie|min|ming|miu|mo|mou|mu|na|nai|nan|nang|nao|ne" .
        "|nei|nen|neng|ni|nian|niang|niao|nie|nin|ning|niu|nong|nu|nv|nuan|nue|nuo|o|ou|pa|pai|pan|pang|pao|pei|pen" .
        "|peng|pi|pian|piao|pie|pin|ping|po|pu|qi|qia|qian|qiang|qiao|qie|qin|qing|qiong|qiu|qu|quan|que|qun|ran|rang" .
        "|rao|re|ren|reng|ri|rong|rou|ru|ruan|rui|run|ruo|sa|sai|san|sang|sao|se|sen|seng|sha|shai|shan|shang|shao|" .
        "she|shen|sheng|shi|shou|shu|shua|shuai|shuan|shuang|shui|shun|shuo|si|song|sou|su|suan|sui|sun|suo|ta|tai|" .
        "tan|tang|tao|te|teng|ti|tian|tiao|tie|ting|tong|tou|tu|tuan|tui|tun|tuo|wa|wai|wan|wang|wei|wen|weng|wo|wu" .
        "|xi|xia|xian|xiang|xiao|xie|xin|xing|xiong|xiu|xu|xuan|xue|xun|ya|yan|yang|yao|ye|yi|yin|ying|yo|yong|you" .
        "|yu|yuan|yue|yun|za|zai|zan|zang|zao|ze|zei|zen|zeng|zha|zhai|zhan|zhang|zhao|zhe|zhen|zheng|zhi|zhong|" .
        "zhou|zhu|zhua|zhuai|zhuan|zhuang|zhui|zhun|zhuo|zi|zong|zou|zu|zuan|zui|zun|zuo";
    $_DataValue = "-20319|-20317|-20304|-20295|-20292|-20283|-20265|-20257|-20242|-20230|-20051|-20036|-20032|-20026|-20002|-19990" .
        "|-19986|-19982|-19976|-19805|-19784|-19775|-19774|-19763|-19756|-19751|-19746|-19741|-19739|-19728|-19725" .
        "|-19715|-19540|-19531|-19525|-19515|-19500|-19484|-19479|-19467|-19289|-19288|-19281|-19275|-19270|-19263" .
        "|-19261|-19249|-19243|-19242|-19238|-19235|-19227|-19224|-19218|-19212|-19038|-19023|-19018|-19006|-19003" .
        "|-18996|-18977|-18961|-18952|-18783|-18774|-18773|-18763|-18756|-18741|-18735|-18731|-18722|-18710|-18697" .
        "|-18696|-18526|-18518|-18501|-18490|-18478|-18463|-18448|-18447|-18446|-18239|-18237|-18231|-18220|-18211" .
        "|-18201|-18184|-18183|-18181|-18012|-17997|-17988|-17970|-17964|-17961|-17950|-17947|-17931|-17928|-17922" .
        "|-17759|-17752|-17733|-17730|-17721|-17703|-17701|-17697|-17692|-17683|-17676|-17496|-17487|-17482|-17468" .
        "|-17454|-17433|-17427|-17417|-17202|-17185|-16983|-16970|-16942|-16915|-16733|-16708|-16706|-16689|-16664" .
        "|-16657|-16647|-16474|-16470|-16465|-16459|-16452|-16448|-16433|-16429|-16427|-16423|-16419|-16412|-16407" .
        "|-16403|-16401|-16393|-16220|-16216|-16212|-16205|-16202|-16187|-16180|-16171|-16169|-16158|-16155|-15959" .
        "|-15958|-15944|-15933|-15920|-15915|-15903|-15889|-15878|-15707|-15701|-15681|-15667|-15661|-15659|-15652" .
        "|-15640|-15631|-15625|-15454|-15448|-15436|-15435|-15419|-15416|-15408|-15394|-15385|-15377|-15375|-15369" .
        "|-15363|-15362|-15183|-15180|-15165|-15158|-15153|-15150|-15149|-15144|-15143|-15141|-15140|-15139|-15128" .
        "|-15121|-15119|-15117|-15110|-15109|-14941|-14937|-14933|-14930|-14929|-14928|-14926|-14922|-14921|-14914" .
        "|-14908|-14902|-14894|-14889|-14882|-14873|-14871|-14857|-14678|-14674|-14670|-14668|-14663|-14654|-14645" .
        "|-14630|-14594|-14429|-14407|-14399|-14384|-14379|-14368|-14355|-14353|-14345|-14170|-14159|-14151|-14149" .
        "|-14145|-14140|-14137|-14135|-14125|-14123|-14122|-14112|-14109|-14099|-14097|-14094|-14092|-14090|-14087" .
        "|-14083|-13917|-13914|-13910|-13907|-13906|-13905|-13896|-13894|-13878|-13870|-13859|-13847|-13831|-13658" .
        "|-13611|-13601|-13406|-13404|-13400|-13398|-13395|-13391|-13387|-13383|-13367|-13359|-13356|-13343|-13340" .
        "|-13329|-13326|-13318|-13147|-13138|-13120|-13107|-13096|-13095|-13091|-13076|-13068|-13063|-13060|-12888" .
        "|-12875|-12871|-12860|-12858|-12852|-12849|-12838|-12831|-12829|-12812|-12802|-12607|-12597|-12594|-12585" .
        "|-12556|-12359|-12346|-12320|-12300|-12120|-12099|-12089|-12074|-12067|-12058|-12039|-11867|-11861|-11847" .
        "|-11831|-11798|-11781|-11604|-11589|-11536|-11358|-11340|-11339|-11324|-11303|-11097|-11077|-11067|-11055" .
        "|-11052|-11045|-11041|-11038|-11024|-11020|-11019|-11018|-11014|-10838|-10832|-10815|-10800|-10790|-10780" .
        "|-10764|-10587|-10544|-10533|-10519|-10331|-10329|-10328|-10322|-10315|-10309|-10307|-10296|-10281|-10274" .
        "|-10270|-10262|-10260|-10256|-10254";

    $_TDataKey = explode('|', $_DataKey);
    $_TDataValue = explode('|', $_DataValue);

    // Prefer the built-in; checking for the function avoids comparing
    // version strings, which misorders versions such as "10.0" vs "5.0".
    $_Data = function_exists('array_combine')
        ? array_combine($_TDataKey, $_TDataValue)
        : _Array_Combine($_TDataKey, $_TDataValue);

    // Descending by code, so the first entry <= a character code is its syllable.
    arsort($_Data);
    reset($_Data);

    // Comparison, not assignment: only non-GB2312 input is converted.
    if ($_Code != 'gb2312') {
        $_String = _U2_Utf8_Gb($_String);
    }

    $_Res = '';
    $_Len = strlen($_String);
    for ($i = 0; $i < $_Len; $i++) {
        $_P = ord(substr($_String, $i, 1));
        if ($_P > 160) {
            // Lead byte of a two-byte character: consume the trail byte and
            // fold both into the signed 16-bit code used by the table.
            $_Q = ord(substr($_String, ++$i, 1));
            $_P = $_P * 256 + $_Q - 65536;
        }
        $_Res .= _Pinyin($_P, $_Data);
    }

    return preg_replace("/[^a-z0-9]*/", '', $_Res);
}
/**
 * Map one character code to its output text.
 *
 * @param int   $_Num  Byte value (1..159) or signed two-byte code as
 *                     computed by Pinyin().
 * @param array $_Data syllable => code map; the caller sorts it by code in
 *                     descending order before passing it in.
 * @return string chr($_Num) for 1..159, '' for codes outside
 *                -20319..-10247, otherwise the key of the first entry whose
 *                code is <= $_Num (or the last key when none qualifies).
 */
function _Pinyin($_Num, $_Data)
{
    if ($_Num > 0 && $_Num < 160) {
        return chr($_Num);
    }
    if ($_Num < -20319 || $_Num > -10247) {
        return '';
    }

    // Pre-set so an empty table yields '' rather than an undefined variable.
    $_Key = '';
    foreach ($_Data as $_Key => $_Value) {
        if ($_Value <= $_Num) {
            break;
        }
    }
    return $_Key;
}
/**
 * Convert UTF-8 text, or a single Unicode code point, to GB2312 bytes.
 *
 * @param string|int $_C A UTF-8 string (how Pinyin() calls this function)
 *                       or an integer code point below 0x200000.
 * @return string|false Result of iconv('UTF-8', 'GB2312', ...).
 */
function _U2_Utf8_Gb($_C)
{
    // A string is already UTF-8 text; the numeric comparisons below are
    // only meaningful for an integer code point.
    if (is_string($_C)) {
        return iconv('UTF-8', 'GB2312', $_C);
    }

    // Encode the code point as 1 to 4 UTF-8 bytes.
    $_String = '';
    if ($_C < 0x80) {
        // Emit the byte itself, not its decimal digits.
        $_String .= chr($_C);
    } elseif ($_C < 0x800) {
        $_String .= chr(0xC0 | $_C >> 6);
        $_String .= chr(0x80 | $_C & 0x3F);
    } elseif ($_C < 0x10000) {
        $_String .= chr(0xE0 | $_C >> 12);
        $_String .= chr(0x80 | $_C >> 6 & 0x3F);
        $_String .= chr(0x80 | $_C & 0x3F);
    } elseif ($_C < 0x200000) {
        $_String .= chr(0xF0 | $_C >> 18);
        $_String .= chr(0x80 | $_C >> 12 & 0x3F);
        $_String .= chr(0x80 | $_C >> 6 & 0x3F);
        $_String .= chr(0x80 | $_C & 0x3F);
    }
    return iconv('UTF-8', 'GB2312', $_String);
}
/**
 * Fallback for array_combine(): pair list keys with list values by index.
 *
 * @param array $_Arr1 Zero-indexed list of keys.
 * @param array $_Arr2 Zero-indexed list of values, at least as long as $_Arr1.
 * @return array Map of $_Arr1[i] => $_Arr2[i]; empty when $_Arr1 is empty.
 */
function _Array_Combine($_Arr1, $_Arr2)
{
    // Initialise so empty input returns an array, not an undefined variable.
    $_Res = array();
    $_Count = count($_Arr1);
    for ($i = 0; $i < $_Count; $i++) {
        $_Res[$_Arr1[$i]] = $_Arr2[$i];
    }
    return $_Res;
}
// Demo 1: no second argument, so $_Code keeps its default value 'gb2312'.
echo Pinyin ('This is a small ultra-site, visit http://www.eb163.com'); // default encoding: GB2312
// Demo 2: a second argument other than 'gb2312' is supplied (here the integer 1).
// NOTE(review): presumably this is meant to mark the input as UTF-8 so that
// Pinyin() converts it through _U2_Utf8_Gb() first -- confirm against Pinyin().
echo Pinyin ('This is the WEB programming network', 1); // any value other than 'gb2312'
?>
#!/usr/bin/env python
#vim: encoding=utf-8
"""
Pinyin segmentation: split an unspaced pinyin string into syllables.
"""
__author__ = "dreampuf<[email protected]>"
import unittest
import cPickle as pickle
from collections import defaultdict
from pprint import pprint
def tree():
    """Return an autovivifying nested mapping.

    Looking up a missing key creates another ``tree()`` under that key, so
    deep paths such as ``t[0][2][4]`` can be built by plain indexing.
    """
    return defaultdict(tree)
# Every syllable spelling accepted by pinyin_split(), which walks this set and
# tests each entry as a prefix of the input at the current position.
# Most entries are plain ASCII; a few carry diacritics written as escapes
# (e.g. u'\u0148g', u'l\u01dc', u'n\xfce').
py = set([u'gu', u'qiao', u'qian', u'ge', u'gang', u'ga', u'lian', u'liao', u'rou', u'zong', u'tu', u'seng', u'ti', u'te', u'ta', u'nong', u'zhang', u'fan', u'tuan', u'gua', u'die', u'gui', u'guo', u'gun', u'sang', u'diu', u'tei', u'zi', u'ze', u'za', u'chen', u'zu', u'ruo', u'dian', u'diao', u'nei', u'suo', u'sun', u'zhao', u'sui', u'kuo', u'kun', u'kui', u'zhai', u'zuan', u'kua', u'bo', u'ning', u'lei', u'neng', u'men', u'mei', u'geng', u'chang', u'shua', u'cha', u'che', u'fen', u'chi', u'fei', u'chu', u'shui', u'me', u'ma', u'mo', u'mi', u'mu', u'dei', u'cai', u'zhan', u'cao', u'can', u'den', u'wang', u'beng', u'zhuang', u'tan', u'tao', u'tai', u'eng', u'song', u'ping', u'hou', u'cuan', u'\u0148g', u'lan', u'lao', u'fu', u'fa', u'jiong', u'mai', u'xiang', u'mao', u'fo', u'a', u'jiang', u'kuang', u'bing', u'su', u'si', u'sa', u'se', u'zan', u'm\u0300', u'xuan', u'zei', u'zen', u'kong', u'pang', u'le', u'jia', u'jin', u'lo', u'lai', u'li', u'peng', u'lu', u'yi', u'yo', u'ya', u'cen', u'dan', u'dao', u'ye', u'din', u'cei', u'zhen', u'jiu', u'bang', u'nou', u'yu', u'weng', u'wong', u'en', u'ei', u'kang', u'dia', u'er', u'ru', u'keng', u're', u'ren', u'gou', u'ri', u'she', u'tian', u'tiao', u'que', u'shi', u'shun', u'shuo', u'qun', u'xue', u'yun', u'xun', u'fiao', u'yue', u'ding', u'zao', u'rang', u'xi', u'yong', u'zai', u'guan', u'guai', u'dong', u'kuai', u'ying', u'kuan', u'xu', u'xia', u'xie', u'yin', u'rong', u'xin', u'tou', u'nian', u'niao', u'xiu', u'man', u'kou', u'niang', u'hua', u'chao', u'hun', u'huo', u'hui', u'shuan', u'quan', u'shuai', u'chong', u'bei', u'ben', u'dang', u'sai', u'ang', u'sao', u'san', u'reng', u'ran', u'rao', u'ming', u'l\u01dc', u'l\u01da', u'l\u01d8', u'lie', u'lia', u'min', u'miao', u'mian', u'mie', u'liu', u'zou', u'miu', u'nen', u'kai', u'kao', u'kan', u'dai', u'ka', u'ke', u'yang', u'ku', u'deng', u'dou', u'shou', u'chuang', u'nang', u'feng', u'meng', u'cheng', u'di', u'de', u'da', u'gei', u'du', u'gen', u'qu', u'shu', u'sha', 
u'\u1e3f', u'ban', u'bao', u'bai', u'nun', u'nuo', u'sen', u'kei', u'fang', u'teng', u'lun', u'luo', u'ken', u'wa', u'wo', u'ju', u'tui', u'wu', u'jie', u'ji', u'huang', u'tuo', u'cou', u'la', u'mang', u'ci', u'tun', u'tong', u'ca', u'pou', u'ce', u'gong', u'cu', u'dui', u'dun', u'duo', u'ting', u'qie', u'yao', u'yan', u'pi', u'po', u'suan', u'chua', u'chun', u'\u0148', u'chui', u'gao', u'gan', u'ao', u'gai', u'xiong', u'tang', u'n', u'pian', u'piao', u'cang', u'heng', u'xian', u'xiao', u'bian', u'biao', u'zhua', u'duan', u'cong', u'zhui', u'zhuo', u'zhun', u'hong', u'shuang', u'juan', u'zhei', u'pai', u'shai', u'shan', u'shao', u'pan', u'pao', u'nin', u'nia', u'hang', u'\u01f9g', u'nie', u'zhuai', u'mou', u'zhuan', u'yuan', u'niu', u'zhong', u'qi', u'lin', u'guang', u'nao', u'n\u01d8', u'n\u01da', u'n\u01dc', u'hai', u'han', u'hao', u'wei', u'wen', u'ruan', u'cuo', u'cun', u'cui', u'bin', u'bie', u'l\xfce', u'shen', u'shei', u'fou', u'xing', u'\u0144g', u'qia', u'qiang', u'nuan', u'pen', u'pei', u'\u01f9', u'rui', u'run', u'ba', u'sheng', u'rua', u'bi', u'bu', u'chuan', u'qing', u'chuai', u'pu', u'o', u'chou', u'ou', u'zui', u'luan', u'zuo', u'jian', u'jiao', u'sou', u'wan', u'jing', u'qiong', u'wai', u'long', u'pa', u'liang', u'lou', u'huan', u'hen', u'hei', u'huai', u'n\xfce', u'\u0144', u'jue', u'shang', u'jun', u'hu', u'hm', u'ling', u'ha', u'he', u'zhu', u'ceng', u'zha', u'zhe', u'zhi', u'qin', u'pin', u'ai', u'chai', u'chan', u'pie', u'zeng', u'an', u'qiu', u'ni', u'na', u'zang', u'nai', u'nan', u'ne', u'ng', u'chuo', u'tie', u'you', u'nu', u'zheng', u'leng', u'zun', u'zhou', u'lang', u'e', u'hng'])
def pinyin_sub_gen(nodes, s, v):
    """Yield every syllable sequence encoded by the position tree ``nodes``.

    ``nodes`` maps an end position to the sub-tree of positions reachable
    from it. Each path through the tree is turned into the list of slices
    of ``v`` between consecutive positions. Because every node also yields
    an empty continuation, all prefixes of every path are produced as well.

    Args:
        nodes: Mapping of position -> nested mapping of the same shape.
        s: Position in ``v`` at which this level starts.
        v: The string being split.

    Yields:
        Lists of substrings of ``v``; the last item is always ``[]``.
    """
    # ``items()`` exists on both Python 2 and Python 3 mappings, whereas
    # ``iteritems()`` raises AttributeError on Python 3.
    for p, sub_node in nodes.items():
        for sub in pinyin_sub_gen(sub_node, p, v):
            if s == p:
                # Zero-width step (the root entry): there is no text to emit.
                yield sub
            else:
                yield [v[s:p]] + sub
    # The path may also stop at this node.
    yield []
def pinyin_split(pinyin):
    """Split ``pinyin`` into syllables, preferring the most segments.

    A tree of reachable positions is grown level by level: from each
    position, every entry of ``py`` that matches the text at that position
    adds a child keyed by the position just past it. All sequences encoded
    by the tree (including partial prefixes) are then enumerated, and the
    first one with the greatest number of segments is returned.

    Args:
        pinyin: The unspaced pinyin text.

    Returns:
        A list of substrings of ``pinyin``; ``[]`` when nothing matches.
    """
    root = tree()
    frontier = [(0, root[0])]
    while frontier:
        next_frontier = []
        for start, branch in frontier:
            for syllable in py:
                end = start + len(syllable)
                if pinyin[start:end] == syllable:
                    # Indexing the branch creates the child node on demand.
                    next_frontier.append((end, branch[end]))
        frontier = next_frontier
    # max() keeps the first of equally long candidates, which is the same
    # element a stable descending sort would place first.
    return max(pinyin_sub_gen(root, 0, pinyin), key=len)
class TestCase(unittest.TestCase):
    """Smoke tests for ``pinyin_split``."""

    def test_pinyin_dp(self):
        """Split two sample strings, print the results and check them."""
        v = "woshidewenfensi"
        result = pinyin_split(v)
        pprint(result)
        # Only one segmentation of this input exists.
        self.assertEqual(result, ["wo", "shi", "de", "wen", "fen", "si"])

        v = "woaibeijinganmen"
        result = pinyin_split(v)
        pprint(result)
        # Several splits tie for the most segments (e.g. "ji"+"n"+"ga" vs
        # "ji"+"ng"+"a"), so only check that the result covers the input.
        self.assertEqual("".join(result), v)
# Run the test suite when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
woshidewenfensi -> ['wo', 'shi', 'de', 'wen', 'fen', 'si']
woaibeijinanmen -> ['wo', 'ai', 'bei', 'ji', 'n', 'a', 'n', 'me', 'n']
# All candidate combinations that exist:
[['Wo', 'ai', 'bei', 'ji', 'n', 'a', 'n', 'me', 'n'],
['Wo', 'ai', 'bei', 'ji', 'n', 'a', 'n', 'me'],
['Wo', 'ai', 'bei', 'ji', 'n', 'a', 'n', 'men'],
['Wo', 'ai', 'bei', 'ji', 'n', 'an', 'me', 'n'],
['Wo', 'ai', 'bei', 'ji', 'na', 'n', 'me', 'n'],
['Wo', 'ai', 'bei', 'jin', 'a', 'n', 'me', 'n'],
['Wo', 'ai', 'bei', 'ji', 'n', 'a', 'n'],
['Wo', 'ai', 'bei', 'ji', 'n', 'an', 'me'],
['Wo', 'ai', 'bei', 'ji', 'n', 'an', 'men'],
['Wo', 'ai', 'bei', 'ji', 'na', 'n', 'me'],
['Wo', 'ai', 'bei', 'ji', 'na', 'n', 'men'],
['Wo', 'ai', 'bei', 'ji', 'nan', 'me', 'n'],
['Wo', 'ai', 'bei', 'jin', 'a', 'n', 'me'],
['Wo', 'ai', 'bei', 'jin', 'a', 'n', 'men'],
['Wo', 'ai', 'bei', 'jin', 'an', 'me', 'n'],
['Wo', 'ai', 'bei', 'ji', 'n', 'a'],
['Wo', 'ai', 'bei', 'ji', 'n', 'an'],
['Wo', 'ai', 'bei', 'ji', 'na', 'n'],
['Wo', 'ai', 'bei', 'ji', 'nan', 'me'],
['Wo', 'ai', 'bei', 'ji', 'nan', 'men'],
['Wo', 'ai', 'bei', 'jin', 'a', 'n'],
['Wo', 'ai', 'bei', 'jin', 'an', 'me'],
['Wo', 'ai', 'bei', 'jin', 'an', 'men'],
['Wo', 'ai', 'bei', 'ji', 'n'],
['Wo', 'ai', 'bei', 'ji', 'na'],
['Wo', 'ai', 'bei', 'ji', 'nan'],
['Wo', 'ai', 'bei', 'jin', 'a'],
['Wo', 'ai', 'bei', 'jin', 'an'],
['Wo', 'ai', 'bei', 'ji'],
['Wo', 'ai', 'bei', 'jin'],
['Wo', 'ai', 'bei'],
['Wo', 'a'],
['Wo', 'ai'],
['Wo'],
[],
[]]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment