Created
July 1, 2017 06:18
-
-
Save whs/5f512b834082bb4676fc77e81bdfae6e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def split_icu(txt):
    """Split *txt* into tokens using ICU's Thai word break iterator.

    ICU's word ``BreakIterator`` for the ``th`` locale is used to find the
    token boundaries. Consecutive tokens that do not start with a Thai
    character (Latin words, digits, spaces, punctuation, ...) are glued
    back together into a single token, while every token that starts with
    a Thai character is kept on its own.

    Args:
        txt: The text to segment.

    Returns:
        A list of non-empty text tokens, in their original order.

    Raises:
        ImportError: If the PyICU bindings are not installed.
    """
    # PyICU's current module name is ``icu``; ``PyICU`` is the legacy name
    # the original code imported, so fall back to it for old installs.
    try:
        import icu as PyICU
    except ImportError:
        import PyICU

    # ``unicode`` only exists on Python 2; on Python 3 ``str`` is the text type.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    def is_tha(c):
        """Return True if *c* lies in the Unicode Thai block (U+0E00-U+0E7F)."""
        return 0x0E00 <= ord(c) <= 0x0E7F

    icu_txt = PyICU.UnicodeString(txt)
    brk_iter = PyICU.BreakIterator.createWordInstance(PyICU.Locale("th"))
    brk_iter.setText(icu_txt)

    merged = []
    start = 0
    # Iterating the break iterator yields successive boundary offsets.
    for end in brk_iter:
        # Slice the UnicodeString (not ``txt``): the offsets index into it.
        tok = text_type(icu_txt[start:end])
        start = end
        if not tok:
            # Skip empty tokens up front so ``tok[0]`` below cannot fail.
            continue
        # Merge only when there is a previous token and neither it nor the
        # current token starts with a Thai character. The original guard
        # tested ``len(ans[:-1]) == 0``, which also blocked merging into the
        # very first token.
        if merged and not is_tha(tok[0]) and not is_tha(merged[-1][0]):
            merged[-1] += tok
        else:
            merged.append(tok)
    return merged
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment