Skip to content

Instantly share code, notes, and snippets.

@yosshy
Last active September 26, 2021 13:04
Show Gist options
  • Save yosshy/2394c6e48bdf38dc5a6e3ace33767fc1 to your computer and use it in GitHub Desktop.
Save yosshy/2394c6e48bdf38dc5a6e3ace33767fc1 to your computer and use it in GitHub Desktop.
Japanese translation checker for minikube
import codecs
import json
import re
import string
# Katakana run ending in an a-row or small-vowel kana, followed by exactly one
# character outside ア-ン. That trailing character is part of the match so the
# caller can test whether it is the long-vowel mark "ー".
CHO_ON_PATTERN = re.compile(r"[ア-ンー・]+[アカガサザタダナハバパマヤラワァィゥェォ][^ア-ン]")
# Characters that are reported as forbidden in a translation.
# NOTE(review): the first unescaped "]" ends the character class early, so the
# remainder (”’,.・0123456789:/-]) is matched as a fixed sequence following one
# class member (with "." acting as a wildcard) instead of as further members.
# As written this pattern can practically never match. The intended set cannot
# be determined from here (possibly full-width forms) -- TODO confirm the
# characters and escape the brackets.
ILLEGAL_CHARACTERS = re.compile("[ ({【『[)}】』]”’,.・0123456789:/-]")
# Kanji spellings that are reported as restricted notation.
ILLEGAL_KANJI_PATTERN = re.compile("(下さい|無し|為|する事)")
# One or more non-quote characters enclosed in single quotes.
QUOTED_PATTERN = re.compile(r"'[^']+'")
# One or more non-quote characters enclosed in double quotes.
DOUBLE_QUOTED_PATTERN = re.compile(r'"[^"]+"')
# ASCII letters plus ASCII punctuation: the "half-width" characters that are
# expected to be separated from wide (non-ASCII-printable) text by a space.
LETTERS = string.ascii_letters + string.punctuation
# Both strings are interpolated into a character class, so they must go
# through re.escape(). Without it the "\" that precedes "]" in
# string.punctuation is consumed as an escape character, which silently drops
# the backslash from the set and makes it count as a wide character.
# Half-width character immediately followed by a wide character.
HANKAKU_SPACE_PATTERN1 = re.compile(
    f"[{re.escape(LETTERS)}][^{re.escape(string.printable)}]"
)
# Wide character immediately followed by a half-width character.
HANKAKU_SPACE_PATTERN2 = re.compile(
    f"[^{re.escape(string.printable)}][{re.escape(LETTERS)}]"
)
# Check for katakana words that lack a trailing long-vowel mark.
def cho_on(source: str) -> list:
    """Return katakana words in *source* that are not followed by "ー".

    ``CHO_ON_PATTERN`` matches a katakana run ending in an a-row or
    small-vowel kana together with the single character that follows it.
    A match is reported, with that trailing character removed, unless the
    trailing character is the long-vowel mark.

    Args:
        source: Text to inspect.

    Returns:
        The matching words in order of appearance; an empty list when there
        are none.
    """
    # The pattern requires one character after the word, so a word sitting at
    # the very end of the text could never match. Appending a sentinel that is
    # outside ア-ン lets such a word be checked; the sentinel is dropped again
    # by the [:-1] slice below.
    matches = CHO_ON_PATTERN.findall(source + "\n")
    return [word[:-1] for word in matches if word[-1] != "ー"]
# Check for emphasized / quoted passages.
def quoted(source: str) -> list:
    """Return every quoted passage found in *source*.

    Single-quoted passages (``QUOTED_PATTERN``) are listed first, followed by
    double-quoted ones (``DOUBLE_QUOTED_PATTERN``). Each entry is the complete
    match including both quote characters; the previous implementation cut
    off the closing quote because of a filter copied from ``cho_on``.

    Args:
        source: Text to inspect.

    Returns:
        The quoted passages; an empty list when there are none.
    """
    single_quoted = QUOTED_PATTERN.findall(source)
    double_quoted = DOUBLE_QUOTED_PATTERN.findall(source)
    return single_quoted + double_quoted
# Check whether forbidden characters are used.
def illegal_characters(source: str) -> list:
    """Return every match of ``ILLEGAL_CHARACTERS`` in *source*.

    Args:
        source: Text to inspect.

    Returns:
        The matches in order of appearance. An empty list (rather than an
        implicit ``None``) is returned when nothing matches, so the result is
        always a list and still falsy for callers that test ``if result:``.
    """
    return ILLEGAL_CHARACTERS.findall(source)
# Check whether restricted kanji spellings are used.
def illegal_kanji(source: str) -> list:
    """Return every match of ``ILLEGAL_KANJI_PATTERN`` in *source*.

    Args:
        source: Text to inspect.

    Returns:
        The matched spellings in order of appearance. An empty list (rather
        than an implicit ``None``) is returned when nothing matches, so the
        result is always a list and still falsy for callers that test
        ``if result:``.
    """
    return ILLEGAL_KANJI_PATTERN.findall(source)
# Check that half-width words and full-width words are separated by a
# half-width space.
def hankaku_space(source: str) -> list:
    """Return character pairs where half-width and wide text touch directly.

    Two kinds of two-character matches are collected:

    * ``HANKAKU_SPACE_PATTERN1``: a half-width letter/punctuation character
      immediately followed by a wide character. Pairs starting with "(" or
      ending with "」", "。" or "、" are exempt.
    * ``HANKAKU_SPACE_PATTERN2``: a wide character immediately followed by a
      half-width letter/punctuation character. Pairs starting with "。", "、"
      or "「", or ending with ":" or ")", are exempt.

    The previous implementation computed the second list but never returned
    it; both lists are now reported.

    Args:
        source: Text to inspect.

    Returns:
        Pattern-1 pairs followed by pattern-2 pairs; an empty list when there
        are none.
    """
    half_then_wide = [
        pair
        for pair in HANKAKU_SPACE_PATTERN1.findall(source)
        if pair[0] not in ("(",) and pair[1] not in ("」", "。", "、")
    ]
    wide_then_half = [
        pair
        for pair in HANKAKU_SPACE_PATTERN2.findall(source)
        if pair[0] not in ("。", "、", "「") and pair[1] not in (":", ")")
    ]
    return half_then_wide + wide_then_half
# Load the translation table. The loop below treats every value as a
# translated (Japanese) string; the keys are not needed for the checks.
with open("ja.json", encoding="utf8") as f:
    data = json.load(f)

# Each check function paired with the label printed when it reports findings.
_CHECKS = (
    (cho_on, "長音注意"),
    (quoted, "強調・引用注意"),
    (illegal_characters, "使用禁止文字あり"),
    (illegal_kanji, "使用禁止フレーズあり"),
    (hankaku_space, "半角スペース注意"),
)

# Entries are visited in file order (the former bare ``sorted(data)`` call
# discarded its result and had no effect). The counter is a line number and
# starts at 2 -- presumably line 1 of ja.json is the opening brace and every
# entry occupies one line; TODO confirm against the file layout.
for _count, japanese in enumerate(data.values(), start=2):
    # Interpret backslash escape sequences still present in the value.
    # NOTE: codecs.escape_decode is an undocumented CPython helper.
    japanese2 = codecs.escape_decode(bytes(japanese, "utf-8"))[0]\
        .decode("utf-8")
    for check, label in _CHECKS:
        result = check(japanese2)
        if result:
            # str(list)[1:-1] prints the findings without the list brackets.
            print(f"{_count} {japanese}: {label} ({str(result)[1:-1]})")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment