Last active
June 10, 2019 08:45
-
-
Save kzinmr/0abe0d898bc6ca33929229c4f91a14f0 to your computer and use it in GitHub Desktop.
漢数字を数字に直す (revised from https://qiita.com/dosec/items/c6aef40fae6977fd89ab)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

# Translation table for single-character digits: the nine standard kanji
# digits, the kanji zero, and three formal (daiji) variants of 1-3, mapped
# position by position onto ASCII digits.
tt_ksuji = str.maketrans('一二三四五六七八九〇壱弐参', '1234567890123')

# One candidate number: a maximal run of unit characters and digits.
re_suji = re.compile(r'[十拾百千万億兆\d]+')
# Tokenizer for a group below 10^4: a small-unit character or a digit run.
re_kunit = re.compile(r'[十拾百千]|\d+')
# Tokenizer for a whole number: a large-unit character or the text between
# two large units.
re_manshin = re.compile(r'[万億兆]|[^万億兆]+')

# Multipliers of the units used inside a group below 10^4.
TRANSUNIT = {
    '十': 10,
    '拾': 10,
    '百': 100,
    '千': 1000,
}

# Multipliers of the units that separate groups (10^4, 10^8, 10^12).
TRANSMANS = {
    '万': 10000,
    '億': 100000000,
    '兆': 1000000000000,
}

# Native Japanese counting stems mapped to their digit, as a string.
TRANSWA = {
    'ひと': '1',
    'ふた': '2',
    'みっ': '3',
    'よっ': '4',
    'いつ': '5',
    'むっ': '6',
    'なな': '7',
    'やっ': '8',
    'ここの': '9',
}
def kansuji_to_arabic_numeral(kstring, sep=False):
    """Convert kanji numerals embedded in a string to Arabic numerals.

    The text is processed in two steps. First, single-character kanji
    digits are mapped to ASCII digits with ``tt_ksuji``. Then every run
    matched by ``re_suji`` that still contains a unit character is
    evaluated and replaced by its decimal value.

    Args:
        kstring: Text that may contain kanji numerals.
        sep: When true, each evaluated number is written with ``,`` as the
            thousands separator. Runs that consist only of digits after
            the first step are left exactly as they are, with or without
            ``sep``.

    Returns:
        The converted text.
    """

    def _transvalue(sj, re_obj=re_kunit, transdic=TRANSUNIT):
        """Evaluate one numeral by scanning its tokens right to left."""
        unit = 1  # multiplier still waiting for its coefficient
        result = 0
        for piece in reversed(re_obj.findall(sj)):
            if piece in transdic:
                # Two units in a row: the pending unit had no explicit
                # coefficient, so it counts once.
                if unit > 1:
                    result += unit
                unit = transdic[piece]
            else:
                # A digit run is used directly; anything else is a group
                # that still contains small units and is evaluated with
                # the default (small-unit) tokenizer and table.
                val = int(piece) if piece.isdecimal() else _transvalue(piece)
                result += val * unit
                unit = 1
        # A leading unit with no coefficient in front of it counts once.
        if unit > 1:
            result += unit
        return result

    def _replace(match):
        """Return the replacement text for one numeral run."""
        suji = match.group()
        if suji.isdecimal():
            return suji
        arabic = _transvalue(suji, re_manshin, TRANSMANS)
        return '{:,}'.format(arabic) if sep else str(arabic)

    # Kanji digits -> [0-9], then evaluate each run in a single pass.
    return re_suji.sub(_replace, kstring.translate(tt_ksuji))
def wasuji_to_arabic_numeral(wstring):
    """Map a native Japanese counting stem to its digit.

    Args:
        wstring: The candidate stem. It is looked up whole in ``TRANSWA``;
            no substring matching is done.

    Returns:
        The digit string stored in ``TRANSWA`` when ``wstring`` is a key
        and the stored value is decimal; otherwise ``wstring`` unchanged.
    """
    # Native numeral stem -> [1-9]
    suji = TRANSWA.get(wstring)
    if suji is not None and suji.isdecimal():
        return suji
    return wstring
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment