Skip to content

Instantly share code, notes, and snippets.

View korakot's full-sized avatar

Korakot Chaovavanich korakot

  • Bangkok, Thailand
View GitHub Profile
@korakot
korakot / pyenchant.py
Last active June 30, 2017 15:08
ตรวจคำสะกด โดยทำ word list เอง
# ลง hunspell + python ไม่สำเร็จ
# แต่มาเจอ pyenchant ที่ใช้ได้เหมือนกัน และมี custom word list ด้วย
>>> import enchant
>>> pwl = enchant.request_pwl_dict("royin_2542_wordlist.txt")
>>> pwl.check("กรกฎ")
True
>>> pwl.check("กรกำ")
False
>>> pwl.suggest("กรกำ")
@korakot
korakot / thaisort.py
Created July 13, 2017 09:57
Thai Sort
import icu
# Sort Thai words with the sort keys of ICU's collator for the th_TH locale.
thai_locale = icu.Locale('th_TH')
thai_collator = icu.Collator.createInstance(thai_locale)
thkey = thai_collator.getSortKey  # key function handed to sorted() below
words = 'ไก่ ไข่ ก ฮา'.split()
in_thai_order = sorted(words, key=thkey)
print(in_thai_order) # ['ก', 'ไก่', 'ไข่', 'ฮา']
@korakot
korakot / spell.py
Created July 19, 2017 04:35
Check spelling
>>> import enchant
>>> d = enchant.request_pwl_dict("royin_2542_wordlist.txt")
>>> d.check("กรกำ")
False
>>> d.suggest("กรกำ")
['กรก', 'กรกฎ', 'กรกฏ', 'กรกช', 'กรำ', 'กร่ำ']
@korakot
korakot / els.py
Last active July 31, 2017 10:54
Elasticsearch IPython magic. Changed to %%els to avoid collision. Allows # comments.
# From https://github.com/graphaelli/ipython-elasticsearch
import json
import os, re
import urllib.parse
from IPython.core.magic import Magics, magics_class, line_cell_magic
import requests
@korakot
korakot / multicut.py
Last active February 1, 2018 09:39
Return all possible ways to cut(tokenize) Thai text.
import re
from collections import defaultdict
from marisa_trie import Trie
# Load the dictionary: one word per line, surrounding whitespace stripped.
# The context manager closes the file as soon as the list is built instead
# of leaving the handle open until it happens to be garbage-collected.
# NOTE(review): the file is opened with the platform default encoding --
# confirm that matches how wordlist.txt is stored.
with open('wordlist.txt') as wordlist_file:
    wordlist = [li.strip() for li in wordlist_file]
trie = Trie(wordlist)  # built once, outside any function
class LatticeString(str):
''' String subclass เพื่อเก็บวิธีตัดหลายๆ วิธี
'''
@korakot
korakot / dot.py
Created August 18, 2017 07:06
Print dot to show progress in Jupyter Notebook
# Progress marker: print a dot and replace the default trailing newline
# with the terminator given in `end`.
print('.', end='\u200b')
# \u200b is the zero-width space; per the author's note it helps the
# accumulated dots wrap across lines.
@korakot
korakot / thai_id.py
Created August 26, 2017 09:37
Checking Thai ID number
# Thai national ID check-digit demo.
# Algorithm reference: https://medium.com/earthchie/9af96b5f5588
num = '3-0000-00000-00-5'
# Drop the dashes, then turn every remaining character into an int digit.
nums = list(map(int, num.replace('-','')))
# Weights run 13, 12, ..., 2 (twelve values); zip() stops when they are
# exhausted, so the final digit is not part of the weighted sum.
weights = range(13,1,-1)
prod = sum(digit*weight for digit, weight in zip(nums, weights))
# Check digit derived from the weighted sum.
x = (11 - prod % 11) % 10
assert x == nums[-1] # must equal 5
@korakot
korakot / am.py
Created August 30, 2017 01:46
Correct thai vowel "am" sequence
import re
# Rewrite rules for the Thai "am" vowel, applied to `s` strictly in order
# (the second rule also sees the output of the first).
for _pattern, _replacement in (
    ('ํ([่-๋]?)า', '\\1ำ'),  # case: nikhahit (+ optional tone mark) + sara aa
    ('ำ([่-๋])', '\\1ำ'),  # case: sara am followed by a tone mark
):
    s = re.sub(_pattern, _replacement, s)
s=='น้ำ'
@korakot
korakot / lmcut.py
Last active January 18, 2020 06:43
Longest matching Thai word tokenization
from marisa_trie import Trie
# wordlist = ...  (placeholder: the word list must be defined before this line)
# Trie built from the word list; lmcut() below queries it via trie.prefixes().
trie = Trie(wordlist)
def lmcut(text):
for w in reversed(trie.prefixes(text)):
if w==text:
yield [w]
else:
@korakot
korakot / unicode.py
Created October 26, 2017 11:47
Working with unicode for both python 2 and 3
from __future__ import unicode_literals # at top of module
# Text <-> bytes round trip that behaves the same on Python 2 and Python 3.
s1 = 'The Zen of Python' # unicode on both Python 2 and 3 (unicode_literals)
b2 = b'xxxxxxxxxxxxxx' # byte string on both Python 2 and 3
# Name the codec explicitly: with no argument Python 3 uses UTF-8, but
# Python 2 falls back to ASCII and raises on any non-ASCII character.
b1 = s1.encode('utf-8') # text -> bytes
s2 = b2.decode('utf-8') # bytes -> unicode text