Created
May 18, 2020 07:32
-
-
Save mkyt/f500ac3a0fad1918f89baa80bd2fdcb9 to your computer and use it in GitHub Desktop.
法令漢数字→アラビア数字
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from kanjize import kanji2int | |
import re | |
# One or more kanji numeral characters: plain digits, rank characters
# (十, 百, 千, ...) and the formal variants (壱, 弐, 参, 拾, 萬).
_KANJI_NUM = '[〇一二三四五六七八九十壱弐参拾百千万萬億兆]+'
# The same run wrapped in a capturing group.
_NUM_GROUP = '(' + _KANJI_NUM + ')'

# Rewrite rules as (regex source, index of the capture group to convert).
# The rules are applied in list order, so the order is significant.
exprs = [
    # day of month, e.g. 三月二十八日
    ('月' + _NUM_GROUP + '日', 1),
    # month following a year, e.g. 昭和四十二年三月
    ('年' + _NUM_GROUP + '月', 1),
    # era year, e.g. 昭和四十二年 (group 1 is the era name, group 2 the number)
    ('(明治|大正|昭和|平成|令和)' + _NUM_GROUP + '年', 2),
    # numbered chapter / section / article / paragraph / item, e.g. 第二十三条
    ('第' + _NUM_GROUP + '(章|節|条|項|号)', 1),
    # trailing branch number, e.g. the 十五 in 第十九条の六の十五
    (_KANJI_NUM + 'の' + _NUM_GROUP, 1),
    # first branch number after an article, e.g. 第十九条の六
    ('条の' + _NUM_GROUP, 1),
    # number at the start of a line, e.g. 二
    ('^' + _NUM_GROUP, 1),
    # 第 + number at the start of a line, e.g. 第二
    ('^第' + _NUM_GROUP, 1),
    # parenthesised number at the start of a line, e.g. (三)
    (r'^\(' + _NUM_GROUP + r'\)', 1),
    # quantity followed by a unit or counter, e.g. 七十二時間
    (_NUM_GROUP + '(週|日|時間|分|秒|回)', 1),
]
# Plain kanji digits in value order: the index of each character is its value.
knums = '〇一二三四五六七八九'

# Maps a single kanji digit to its ASCII digit character, e.g. '三' -> '3'.
# Built with a comprehension so no loop variables are left behind in the
# module namespace.
kn2n = {kanji: str(value) for value, kanji in enumerate(knums)}
def is_complex(kn):
    """Return True if *kn* contains any character that rules out digit-by-digit reading.

    Those characters are the rank markers and formal numerals listed in
    ``rank_chars`` below. An empty string, or one made only of other
    characters, yields False.
    """
    rank_chars = '十壱弐参拾百千万萬億兆'
    return any(ch in rank_chars for ch in kn)
def gen_k2i(arg_n):
    """Build a ``re.sub`` replacement callback that converts one capture group.

    Args:
        arg_n: Index of the capture group holding the kanji number.

    Returns:
        A function taking a match object and returning the full matched text
        with only group *arg_n* rewritten as ASCII digits. Text outside that
        group is returned unchanged.
    """
    def res(m):
        k = m.group(arg_n)
        if is_complex(k):
            # Contains rank characters: delegate to kanji2int (from kanjize).
            i = str(kanji2int(k))
        else:
            # treat as a simple list of kanji numerals e.g. "第一三〇号"
            i = ''.join(kn2n[kn] for kn in k)
        # Splice by position instead of str.replace(): replace() would also
        # rewrite any other occurrence of the same characters inside the
        # match (e.g. "十六の六" with group "六" would become "十6の6").
        text = m.string
        return text[m.start(0):m.start(arg_n)] + i + text[m.end(arg_n):m.end(0)]
    return res
def subst(s):
    """Apply every rule in ``exprs``, in order, to *s* and return the result.

    Each rule's regex is compiled with ``re.MULTILINE``, so ``^`` anchors
    match at the start of every line. The capture group named by the rule is
    rewritten through the callback produced by ``gen_k2i``.
    """
    converted = s
    for rule in exprs:
        regex_src, target_group = rule
        matcher = re.compile(regex_src, flags=re.MULTILINE)
        converted = matcher.sub(gen_k2i(target_group), converted)
    return converted
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment