Last active
April 22, 2016 10:42
-
-
Save shunsukeaihara/6448394 to your computer and use it in GitHub Desktop.
rongorongoのThomas Barthel's Transliteration Systemのデータをクローリングしてファイルに保存したり、文字の正規化や分解を行うスクリプト。詳細は以下 http://argmax.jp/index.php?ron
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
from BeautifulSoup import BeautifulSoup | |
import urllib2 | |
import re | |
# Crawl the per-letter transliteration pages and print one cleaned text line
# per <li> element to standard output.

# Page URL template; %s is replaced by a single lowercase letter.
URL = "http://kohaumotu.org/rongorongo_org/translit/%s.html"

# Leading line label (uppercase letter, lowercase letter, two digits) that is
# removed from the start of every list item. Compiled once, outside the loops.
LINE_LABEL = re.compile(r'^[A-Z][a-z]\d\d')

for i in range(97, 123):  # character codes of 'a' through 'z'
    url = URL % chr(i)
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Close the connection explicitly instead of relying on garbage
        # collection to release it.
        response.close()
    s = BeautifulSoup(html)
    for li in s.findAll('li'):
        # NOTE(review): if li.text is a unicode object containing non-ASCII
        # characters, str() would raise UnicodeEncodeError under Python 2 --
        # confirm that the crawled pages are ASCII-only.
        line = str(li.text)
        line = line.strip()
        line = LINE_LABEL.sub('', line)
        line = line.strip()
        # Parenthesised single-argument form prints identically on
        # Python 2 and is also valid Python 3 syntax.
        print(line)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import sys | |
import re | |
import optparse | |
# Module-level option flags read by the normalisation functions below.
# Every flag starts out disabled; the command-line entry point at the bottom
# of the script rebinds them from the parsed options.

# Flags controlling whether compound signs are emitted as separate entries.
SPLIT_CHARACTOR = False
FORCE_JOIN_STACK = False  # keep ':' compounds joined even while splitting
FORCE_JOIN_MERGE = False  # keep ';' compounds joined even while splitting

# Flags controlling what information is dropped from the output.
REMOVE_SUFFIX = False  # drop the non-digit suffix that follows a sign number
REMOVE_MERGE = False  # keep only the first component of a ';' compound
# Patterns are compiled once at import time instead of on every call, and are
# written as raw strings so that \d and \D are not treated as (invalid)
# string escape sequences.
_NUMERIC_ONLY = re.compile(r'^(\d+)$')  # digits only
_NUMERIC_AND_CHAR = re.compile(r'^(\d+)(\D+)$')  # digits, then non-digits


def single_word(word, remove_suffix=None):
    """Normalise a single sign code by zero-padding its number to 3 digits.

    Args:
        word: The code to normalise, e.g. "1", "76" or "5a".
        remove_suffix: Whether to drop the non-digit suffix of codes such as
            "5a". When None (the default) the module-level REMOVE_SUFFIX
            flag is consulted, which is the original behaviour.

    Returns:
        * digits only: the number padded to at least three digits
          ("1" -> "001"; numbers longer than three digits are kept whole);
        * digits followed by one or more non-digit characters: the padded
          number, followed by the suffix unless suffix removal is enabled;
        * anything else: ``word`` unchanged.
    """
    m = _NUMERIC_ONLY.match(word)
    if m:  # digits only
        return "%03d" % int(m.group(1))
    m = _NUMERIC_AND_CHAR.match(word)
    if m:  # digits plus trailing non-digit characters
        numeric = "%03d" % int(m.group(1))
        # The global flag is looked up only when this branch is reached and
        # the caller did not decide explicitly.
        if remove_suffix is None:
            remove_suffix = REMOVE_SUFFIX
        if remove_suffix:
            return numeric
        return numeric + m.group(2)
    return word
def stack_and_merge(word):
    """Normalise one '.'-free unit, handling stacked and merged compounds.

    A unit containing ':' is treated as a stack (vertical compound); otherwise
    a unit containing ';' is treated as a merge (partial fusion). ':' is
    tested first, so a unit containing both separators is split on ':' only.
    Every component is normalised with single_word().

    Returns:
        A list of strings.
        * Stack: the individual components when SPLIT_CHARACTOR is set and
          FORCE_JOIN_STACK is not; otherwise a one-element list with the
          components re-joined by ':'.
        * Merge: if REMOVE_MERGE is set only the first component is kept.
          The components are returned individually when SPLIT_CHARACTOR is
          set and FORCE_JOIN_MERGE is not; otherwise as a one-element list
          re-joined by ';'.
        * Neither separator: a one-element list with the normalised unit.
    """
    if ':' in word:  # stacked compound
        stacks = [single_word(w) for w in word.split(':')]
        if SPLIT_CHARACTOR and not FORCE_JOIN_STACK:
            return stacks
        return [":".join(stacks)]
    if ';' in word:  # merged compound
        merge = [single_word(w) for w in word.split(';')]
        if REMOVE_MERGE:
            # split() always yields at least one element, so this keeps
            # exactly the first component.
            merge = merge[:1]
        if SPLIT_CHARACTOR and not FORCE_JOIN_MERGE:
            return merge
        return [";".join(merge)]
    return [single_word(word)]
def split_linking(word):
    """Break a '.'-linked sequence apart and normalise every unit.

    Each '.'-separated unit is passed to stack_and_merge(); the lists it
    returns are concatenated, in order, into one flat list of strings.
    """
    return [piece
            for unit in word.split('.')
            for piece in stack_and_merge(unit)]
def pattern(word):
    """Normalise one hyphen-delimited token and return a list of strings.

    A token without '.' is handed straight to stack_and_merge(). A token with
    '.' is processed unit by unit via split_linking(); the units are returned
    separately when SPLIT_CHARACTOR is set, otherwise re-joined with '.' into
    a one-element list.
    """
    if '.' not in word:
        # No linking: the result of stack_and_merge() is already a list.
        return stack_and_merge(word)
    pieces = split_linking(word)
    if SPLIT_CHARACTOR:
        return pieces
    return [".".join(pieces)]
def split_line(line):
    """Split a line on '-' separators that lie outside parentheses.

    A '-' at parenthesis depth zero ends the current token and is dropped;
    a '-' inside parentheses is kept as part of the token. Parentheses
    themselves are kept. Empty tokens (from leading, trailing or consecutive
    separators) are not emitted.

    An unmatched ')' is kept in the token but does not lower the depth below
    zero. Without that clamp a stray ')' would leave the depth negative,
    which stops top-level '-' from splitting and lets a later '-' inside
    parentheses split instead.

    Args:
        line: The text to split.

    Returns:
        A list of the non-empty tokens, in order.
    """
    ret = []
    buf = []  # characters of the token being built; joined once per token
    depth = 0  # current parenthesis nesting depth, never negative
    for ch in line:
        if ch == "-" and depth == 0:
            # Top-level separator: flush the token collected so far.
            if buf:
                ret.append("".join(buf))
                buf = []
            continue
        if ch == "(":
            depth += 1
        elif ch == ")" and depth > 0:
            depth -= 1
        buf.append(ch)
    if buf:
        ret.append("".join(buf))
    return ret
if __name__ == "__main__":
    # Command-line entry point: read transliteration lines from standard
    # input and print one normalised entry per output line.
    p = optparse.OptionParser()
    p.add_option('-s', '--splite', action="store_true", dest='split_charactor', help="SPLIT CHARACTOR", default=False)
    p.add_option('-r', '--rsuffix', action="store_true", dest='remove_suffix', help="REMOVE SUFFIX", default=False)
    p.add_option('-m', '--rmerge', action="store_true", dest='remove_merge', help="REMOVE MERGE", default=False)
    p.add_option('-f', '--fjoinstack', action="store_true", dest='force_join_stack', help="FORCE JOIN STACK IF SET SPLITE CHARACTOR", default=False)
    p.add_option('-j', '--fjoinmerge', action="store_true", dest='force_join_merge', help="FORCE JOIN MERGE IF SET SPLITE CHARACTOR", default=False)
    opts, args = p.parse_args()

    # This code runs at module scope, so these assignments rebind the
    # module-level flags that the functions above read.
    SPLIT_CHARACTOR = opts.split_charactor
    REMOVE_MERGE = opts.remove_merge
    REMOVE_SUFFIX = opts.remove_suffix
    FORCE_JOIN_MERGE = opts.force_join_merge
    FORCE_JOIN_STACK = opts.force_join_stack

    for line in sys.stdin:
        for w in split_line(line.strip()):
            w = w.strip()
            if w != "":
                for x in pattern(w):
                    # Parenthesised single-argument form prints identically
                    # on Python 2 and is also valid Python 3 syntax.
                    print(x)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment