Skip to content

Instantly share code, notes, and snippets.

@whs
Created July 1, 2017 06:18
Show Gist options
  • Select an option

  • Save whs/5f512b834082bb4676fc77e81bdfae6e to your computer and use it in GitHub Desktop.

Select an option

Save whs/5f512b834082bb4676fc77e81bdfae6e to your computer and use it in GitHub Desktop.
def split_icu(txt):
    """Split *txt* into tokens using ICU's word break iterator for Thai.

    ICU segments the text into words.  Adjacent tokens that do not start
    with a Thai character (Latin words, digits, whitespace, punctuation,
    ...) are then glued back together, so every non-Thai run comes out as
    a single token while Thai words stay separate.

    Args:
        txt: The text to segment.

    Returns:
        A list of non-empty token strings, in input order.

    Raises:
        ImportError: If the PyICU bindings are not installed.
    """
    # Imported lazily so merely loading this module does not require PyICU.
    # Current PyICU releases expose the bindings as ``icu``; ``PyICU`` is
    # the legacy module name and is kept as a fallback.
    try:
        import icu
    except ImportError:
        import PyICU as icu

    # Text type used to convert ICU strings back to native strings on both
    # Python 2 (``unicode``) and Python 3 (``str``).
    try:
        to_text = unicode  # Python 2
    except NameError:
        to_text = str  # Python 3

    def is_thai(ch):
        """Return True if *ch* lies in the Unicode Thai block U+0E00-U+0E7F."""
        return 0x0E00 <= ord(ch) <= 0x0E7F

    # Slice the ICU string rather than *txt*: the break positions reported
    # by the iterator are offsets into the text it was given.
    icu_txt = icu.UnicodeString(txt)
    brk_iter = icu.BreakIterator.createWordInstance(icu.Locale("th"))
    brk_iter.setText(icu_txt)

    tokens = []
    start = 0
    # Each value produced by the iterator is the end offset of the next
    # segment; the previous end offset is that segment's start.
    for end in brk_iter:
        tok = to_text(icu_txt[start:end])
        start = end
        if not tok:
            # Skip empty segments before indexing tok[0] below.
            continue
        # Merge only when there IS a previous token and neither it nor the
        # current token starts with a Thai character.  (The previous check,
        # ``len(ans[:-1]) == 0``, also fired when exactly one token had been
        # collected, so the first non-Thai run was never merged.)
        if tokens and not is_thai(tok[0]) and not is_thai(tokens[-1][0]):
            tokens[-1] += tok
        else:
            tokens.append(tok)
    return tokens
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment