Skip to content

Instantly share code, notes, and snippets.

@Calvin-Xu
Last active June 27, 2025 22:48
Show Gist options
  • Save Calvin-Xu/2997891de442fcb603fe26928eb8f894 to your computer and use it in GitHub Desktop.
Save Calvin-Xu/2997891de442fcb603fe26928eb8f894 to your computer and use it in GitHub Desktop.
Generating reading pairs / furigana string
import unicodedata
from enum import Enum
from typing import Dict, List, Optional, Tuple

from jaconv import kata2hira
def generate_possible_kanji_reading_pairs(
    text: str, reading: str
) -> Optional[List[List[Tuple[str, str]]]]:
    """
    Generates all possible kanji-reading pairs for a given text and reading.

    It is impossible to always determine a unique reading from the arguments
    alone, so every alignment found is returned. Each kanji block is mapped to
    at least one kana. Parses are ordered so that earlier kanji blocks get the
    shortest readings first (the right-most block keeps the longest remainder).
    This is a good heuristic for short text (like a MeCab token, where there
    should not be any ambiguity in the first place) but fails on e.g.
    鹿乃子のこのこ虎視眈々, しかのこのこのここしたんたん, where the parses
    returned in order are

        鹿乃子(しか)のこのこ虎視眈々(のここしたんたん)
        鹿乃子(しかのこ)のこのこ虎視眈々(こしたんたん)   * correct

    A consumer can prefer the first parse in which every block is assigned
    kana at least as long as the block, falling back to the first parse when
    none qualifies (e.g. 蝦虎魚, はぜ).

    Args:
        text (str): The text to generate pairs for.
        reading (str): The kana reading of the text.

    Returns:
        list: A list of parses. Each parse is a list of (block, kana) pairs in
        text order: one pair per continuous kanji block, preceded by one pair
        for a leading kana run if the text starts with kana. Kana between or
        after kanji blocks are consumed but not listed. Blocks are substrings
        of ``text`` as given; readings are hiragana-normalised, except when
        ``text`` is entirely kanji or entirely kana, in which case the single
        pair ``(text, reading)`` is returned unchanged.
        An empty list means no alignment was found. ``None`` is returned when
        ``text`` ends in a kana that differs from the last kana of ``reading``.

    Raises:
        ValueError: If ``reading`` contains a non-kana character, or if
            ``text`` or ``reading`` is empty.
    """

    def is_kana(char: str) -> bool:
        # These marks stand in for kanji (or are read irregularly), so they are
        # grouped with the surrounding kanji block rather than matched as kana.
        if char in ("々", "ヶ", "ヵ", "〆"):
            return False
        # unicodedata.name() raises ValueError for unnamed code points (e.g.
        # control characters) unless a default is supplied.
        name = unicodedata.name(char, "")
        return "HIRAGANA" in name or "KATAKANA" in name

    if not all(is_kana(char) for char in reading):
        raise ValueError(f"generate_furigana:reading must be in kana: {reading}")
    if len(text) == 0 or len(reading) == 0:
        raise ValueError(
            f"generate_furigana:text and reading must have length > 0: {text}, {reading}"
        )

    # Everything that is not kana is treated as kanji.
    kana_flags = [is_kana(char) for char in text]

    # Base case: nothing to align, the whole reading belongs to the whole text.
    if all(kana_flags) or not any(kana_flags):
        return [[(text, reading)]]

    # Only the reading is normalised. The text keeps its original characters so
    # that returned blocks can be located in the caller's string and so that
    # the kana/kanji classification above is not affected by normalisation;
    # pieces of text are normalised on the fly where they are compared.
    reading = kata2hira(reading)

    # Compare after normalisation so that katakana in the text matches a
    # hiragana reading. Only this final character is verified; the rest of the
    # trailing kana run is consumed positionally below.
    if kana_flags[-1] and kata2hira(text[-1]) != reading[-1]:
        return None

    # Drop the trailing kana run from the text together with the same number
    # of characters from the end of the reading.
    end = len(text)
    while reading and kana_flags[end - 1]:
        end -= 1
        reading = reading[:-1]
    if not reading:
        # The reading ran out before the right-most kanji block was reached.
        return []

    if not any(kana_flags[:end]):
        # Only a single kanji block remains: it takes all remaining reading.
        return [[(text[:end], reading)]]

    # Right-most kanji block is text[block_start:end]. The loop terminates
    # because text[:end] is known to contain at least one kana.
    block_start = end
    while not kana_flags[block_start - 1]:
        block_start -= 1
    # Longest kana run immediately preceding that block.
    kana_start = block_start
    while kana_start > 0 and kana_flags[kana_start - 1]:
        kana_start -= 1

    kanji_block = text[block_start:end]
    head = text[:block_start]
    kana_block = kata2hira(text[kana_start:block_start])
    width = len(kana_block)

    # Every occurrence of the kana run in the reading is a candidate boundary:
    # reading[:cut] is aligned recursively with ``head`` and reading[cut:] goes
    # to the kanji block. ``cut`` stops short of len(reading) so the kanji
    # block always receives at least one kana.
    results: List[List[Tuple[str, str]]] = []
    for cut in range(width, len(reading)):
        if reading[cut - width : cut] != kana_block:
            continue
        sub_parses = generate_possible_kanji_reading_pairs(head, reading[:cut])
        # ``or []`` guards against a None result from the recursive call.
        for parse in sub_parses or []:
            results.append(parse + [(kanji_block, reading[cut:])])
    return results
def generate_furigana(
    text: str,
    reading: str,
    delimiters: Dict[str, Tuple[str, str]],
    min_reading_len=True,
) -> str:
    """
    Returns ``text`` with each aligned block wrapped in ruby markup.

    Args:
        text (str): The text to annotate.
        reading (str): The kana reading of the text.
        delimiters (dict): Opening/closing strings under the keys ``"ruby"``
            (around block plus reading) and ``"rt"`` (around the reading).
        min_reading_len (bool): When true, use the first parse in which every
            block has a reading at least as long as the block itself, falling
            back to the first parse if none qualifies. When false, always use
            the first parse.

    Returns:
        str: The annotated text.

    Raises:
        ValueError: If no parse is available for ``text`` and ``reading``.
    """
    parses = generate_possible_kanji_reading_pairs(text, reading)
    if parses is None or len(parses) == 0:
        raise ValueError(
            f"generate_furigana: no valid configuration found for {text}, {reading}"
        )

    chosen = parses[0]
    if min_reading_len:
        for candidate in parses:
            if all(len(block) <= len(kana) for block, kana in candidate):
                chosen = candidate
                break

    # Walk the text left to right, wrapping the first occurrence of each block
    # found at or after the current position.
    pieces = []
    cursor = 0
    for lemma, kana in chosen:
        hit = text.find(lemma, cursor)
        if hit == -1:
            # Block not present in what is left: emit the remainder unchanged,
            # leaving nothing for later blocks to match against.
            pieces.append(text[cursor:])
            cursor = len(text)
            continue
        pieces.append(text[cursor:hit])
        pieces.append(
            f"{delimiters['ruby'][0]}{lemma}{delimiters['rt'][0]}{kana}{delimiters['rt'][1]}{delimiters['ruby'][1]}"
        )
        cursor = hit + len(lemma)
    pieces.append(text[cursor:])
    return "".join(pieces)
def test_furigana():
    """Prints the ruby markup generated for a list of sample text/reading pairs."""
    markup = {"ruby": ("<ruby>", "</ruby>"), "rt": ("<rt>", "</rt>")}
    samples = [
        ("持ち力と届かない", "もちちからととどかない"),
        ("持ち越し", "もちこし"),
        ("子", "こ"),
        ("朽ちる", "くちる"),
        ("房々", "ふさふさ"),
        ("蛮殻", "バンカラ"),
        ("がぶ飲み", "がぶのみ"),
        ("已んぬる哉", "やんぬるかな"),
        ("付きっ切り", "つきっきり"),
        ("歯が痛いので歯科医に診てもらった", "はがいたいのでしかいにみてもらった"),
        ("鹿乃子のこのこ虎視眈々", "しかのこのこのここしたんたん"),
        (
            "斜め七十七度の並びで泣く泣く嘶くナナハン七台難なく並べて長眺め",
            "ななめななじゅうななどのならびでなくなくいななくななはんななだいなんなくならべてながながめ",
        ),
        ("由比ヶ浜結衣", "ゆいがはまゆい"),
        ("雪ノ下雪乃", "ゆきのしたゆきの"),
        ("蝦虎魚", "はぜ"),
    ]
    for sample_text, sample_reading in samples:
        print(
            generate_furigana(
                sample_text, sample_reading, markup, min_reading_len=True
            )
        )
    # Output lines recorded with the samples, in the same order (presumably
    # what the loop above printed when this was written):
    # <ruby>持<rt>も</rt></ruby>ち<ruby>力<rt>ちから</rt></ruby>と<ruby>届<rt>とど</rt></ruby>かない
    # <ruby>持<rt>も</rt></ruby>ち<ruby>越<rt>こ</rt></ruby>し
    # <ruby>子<rt>こ</rt></ruby>
    # <ruby>朽<rt>く</rt></ruby>ちる
    # <ruby>房々<rt>ふさふさ</rt></ruby>
    # <ruby>蛮殻<rt>バンカラ</rt></ruby>
    # <ruby>がぶ<rt>がぶ</rt></ruby><ruby>飲<rt>の</rt></ruby>み
    # <ruby>已<rt>や</rt></ruby>んぬる<ruby>哉<rt>かな</rt></ruby>
    # <ruby>付<rt>つ</rt></ruby>きっ<ruby>切<rt>き</rt></ruby>り
    # <ruby>歯<rt>は</rt></ruby>が<ruby>痛<rt>いた</rt></ruby>いので<ruby>歯科医<rt>しかい</rt></ruby>に<ruby>診<rt>み</rt></ruby>てもらった
    # <ruby>鹿乃子<rt>しかのこ</rt></ruby>のこのこ<ruby>虎視眈々<rt>こしたんたん</rt></ruby>
    # <ruby>斜<rt>なな</rt></ruby>め<ruby>七十七度<rt>ななじゅうななど</rt></ruby>の<ruby>並<rt>なら</rt></ruby>びで<ruby>泣<rt>な</rt></ruby>く<ruby>泣<rt>な</rt></ruby>く<ruby>嘶<rt>いなな</rt></ruby>くナナハン<ruby>七台難<rt>ななだいなん</rt></ruby>なく<ruby>並<rt>なら</rt></ruby>べて<ruby>長眺<rt>ながなが</rt></ruby>め
    # <ruby>由比ヶ浜結衣<rt>ゆいがはまゆい</rt></ruby>
    # <ruby>雪<rt>ゆき</rt></ruby>ノ<ruby>下雪乃<rt>したゆきの</rt></ruby>
    # <ruby>蝦虎魚<rt>はぜ</rt></ruby>
def main():
    """Script entry point: runs the furigana sample printer."""
    test_furigana()
# Run the samples only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment