indiejoseph · January 27, 2026 15:00
diff --git a/yale_to_jyutping.py b/yale_to_jyutping.py
 import re
 import unicodedata

 # Yale initial -> Jyutping initial ("" is the zero initial).
 ONSETS_JYUTPING = {
    "b": "b",
    "d": "d",
    "g": "g",
    "gw": "gw",
    "j": "z",
    "p": "p",
    "t": "t",
    "k": "k",
    "kw": "kw",
    "ch": "c",
    "m": "m",
    "n": "n",
    "ng": "ng",
    "f": "f",
    "h": "h",
    "s": "s",
    "l": "l",
    "w": "w",
    "y": "j",
    "": "",
 }

 # Yale nucleus -> Jyutping nucleus; "m" and "ng" are the syllabic nasals.
 # "eu" maps to "oe" here; convert_single_syl switches it to "eo" before
 # the codas -i/-n/-t.
 NUCLEI_JYUTPING = {
    "aa": "aa",
    "a": "a",
    "i": "i",
    "yu": "yu",
    "u": "u",
    "eu": "oe",
    "e": "e",
    "o": "o",
    "m": "m",
    "ng": "ng",
 }

 # Yale coda -> Jyutping coda ("" is an open syllable).
 CODAS_JYUTPING = {
    "p": "p",
    "t": "t",
    "k": "k",
    "m": "m",
    "n": "n",
    "ng": "ng",
    "i": "i",
    "u": "u",
    "": "",
 }

 onsets = list(ONSETS_JYUTPING.keys())
 nuclei = list(NUCLEI_JYUTPING.keys())
 codas = list(CODAS_JYUTPING.keys())

 # Longest spelling first, so digraphs ("gw", "ng", "aa", ...) are tried
 # before the single letters they start with.
 _ONSETS_LONGEST_FIRST = sorted(onsets, key=len, reverse=True)
 _NUCLEI_LONGEST_FIRST = sorted(nuclei, key=len, reverse=True)

 # Combining tone marks as they appear after NFD normalisation.
 _GRAVE = "\u0300"
 _ACUTE = "\u0301"
 _MACRON = "\u0304"


 def _split_onset_and_tone(yale):
    """Split one Yale syllable into ``(onset, rest, tone)``.

    ``onset`` is the longest key of ``ONSETS_JYUTPING`` the syllable starts
    with, ``rest`` is what follows it with diacritics and the low-register
    ``h`` removed, and ``tone`` is the Jyutping tone number (1-6).

    The low-register ``h`` is only looked for *after* the onset, so the
    initials ``h`` and ``ch`` are never mistaken for the tone marker.
    """
    decomposed = unicodedata.normalize("NFD", yale)
    base = "".join(c for c in decomposed if unicodedata.category(c) != "Mn")

    onset, rest = "", base
    for candidate in _ONSETS_LONGEST_FIRST:
        if base.startswith(candidate):
            onset, rest = candidate, base[len(candidate) :]
            break

    low_register = "h" in rest
    if low_register:
        rest = rest.replace("h", "", 1)

    # Standard Yale: a-macron / a-grave = 1, a-acute = 2, bare a = 3,
    # grave + h = 4, acute + h = 5, bare + h = 6.
    if low_register:
        if _GRAVE in decomposed:
            tone = 4
        elif _ACUTE in decomposed:
            tone = 5
        else:
            tone = 6
    elif _ACUTE in decomposed:
        tone = 2
    elif _MACRON in decomposed or _GRAVE in decomposed:
        tone = 1
    else:
        tone = 3
    return onset, rest, tone


 def parse_yale(yale):
    """Parse a single Yale syllable.

    Returns ``(onset, nucleus, coda, tone)`` where the first three items are
    Yale spellings (keys of the three mapping tables) and ``tone`` is the
    Jyutping tone number, or ``None`` when the text does not fit the
    onset + nucleus + coda shape described by the tables.
    """
    onset, rest, tone = _split_onset_and_tone(yale)

    if not rest:
        # A bare "m" / "ng" was consumed as the onset above, but it is
        # really a syllabic nasal acting as the nucleus.
        if onset in ("m", "ng"):
            return "", onset, "", tone
        return None

    # Yale drops the second "y" of y + yu, so "yu", "yun" and "yut" after the
    # initial "y" carry the nucleus "yu" (Jyutping jyu, jyun, jyut).
    if onset == "y" and rest in ("u", "un", "ut"):
        return onset, "yu", rest[1:], tone

    for nucleus in _NUCLEI_LONGEST_FIRST:
        if rest.startswith(nucleus):
            coda = rest[len(nucleus) :]
            if coda in codas:
                return onset, nucleus, coda, tone
    return None


 def convert_yale_to_jyutping(yale):
    """Convert Yale text to Jyutping.

    ``/`` separates alternatives and whitespace separates syllables; both
    levels are converted piece by piece and re-joined with ``/`` and a single
    space respectively.
    """
    return "/".join(convert_single(part) for part in yale.split("/"))


 def convert_single(yale):
    """Convert whitespace-separated Yale syllables, joining them with spaces.

    Text containing no syllables at all yields an empty string.
    """
    return " ".join(convert_single_syl(syl) for syl in yale.split())


 def convert_single_syl(yale):
    """Convert one Yale syllable to Jyutping with a trailing tone digit.

    Syllables that ``parse_yale`` cannot analyse fall back to a best-effort
    form: diacritics and the low-register ``h`` are removed and the detected
    tone digit is appended to the otherwise unchanged letters. An empty
    string is returned unchanged.
    """
    if not yale:
        return yale

    parsed = parse_yale(yale)
    if parsed is not None:
        onset, nucleus, coda, tone = parsed
        onset_jp = ONSETS_JYUTPING.get(onset, onset)
        nucleus_jp = NUCLEI_JYUTPING.get(nucleus, nucleus)
        if nucleus == "eu" and coda in ("i", "n", "t"):
            # Yale eui / eun / eut are Jyutping eoi / eon / eot.
            nucleus_jp = "eo"
        elif nucleus == "a" and coda == "":
            # Yale writes the open-syllable long vowel with a single "a".
            nucleus_jp = "aa"
        coda_jp = CODAS_JYUTPING.get(coda, coda)
        return f"{onset_jp}{nucleus_jp}{coda_jp}{tone}"

    # Fallback: keep the letters as they are, but use the same tone rules as
    # the parsed path so both branches agree.
    onset, rest, tone = _split_onset_and_tone(yale)
    return f"{onset}{rest}{tone}"


 def convert_file(input_path, output_path):
    """Rewrite every ``**...**`` span of a UTF-8 text file as Jyutping.

    The file at ``input_path`` is read in full, the text inside each pair of
    double asterisks is passed through ``convert_yale_to_jyutping`` (the
    asterisks themselves are kept), and the result is written to
    ``output_path`` as UTF-8.
    """
    with open(input_path, "r", encoding="utf-8") as source:
        text = source.read()

    # Non-greedy, so each bold span is converted on its own.
    converted = re.sub(
        r"\*\*(.*?)\*\*",
        lambda found: f"**{convert_yale_to_jyutping(found.group(1))}**",
        text,
    )

    with open(output_path, "w", encoding="utf-8") as target:
        target.write(converted)


 if __name__ == "__main__":
    import sys

    # Usage: python yale_to_jyutping.py [INPUT.md [OUTPUT.md]]
    # Without arguments the original hard-coded locations are used, so
    # existing invocations keep working.
    default_input = "/Users/josephcheng/Projects/wordshk_conversations/Dictionary of Cantonese Slang_extracted.md"
    default_output = (
        "/Users/josephcheng/Projects/wordshk_conversations/Dictionary_converted.md"
    )
    cli_paths = sys.argv[1:]
    input_file = cli_paths[0] if len(cli_paths) > 0 else default_input
    output_file = cli_paths[1] if len(cli_paths) > 1 else default_output
    convert_file(input_file, output_file)
	import re
	import unicodedata

	# Yale initial -> Jyutping initial ("" is the zero initial).
	ONSETS_JYUTPING = {
	    "b": "b",
	    "d": "d",
	    "g": "g",
	    "gw": "gw",
	    "j": "z",
	    "p": "p",
	    "t": "t",
	    "k": "k",
	    "kw": "kw",
	    "ch": "c",
	    "m": "m",
	    "n": "n",
	    "ng": "ng",
	    "f": "f",
	    "h": "h",
	    "s": "s",
	    "l": "l",
	    "w": "w",
	    "y": "j",
	    "": "",
	}

	# Yale nucleus -> Jyutping nucleus; "m" and "ng" are the syllabic nasals.
	# "eu" maps to "oe" here; convert_single_syl switches it to "eo" before
	# the codas -i/-n/-t.
	NUCLEI_JYUTPING = {
	    "aa": "aa",
	    "a": "a",
	    "i": "i",
	    "yu": "yu",
	    "u": "u",
	    "eu": "oe",
	    "e": "e",
	    "o": "o",
	    "m": "m",
	    "ng": "ng",
	}

	# Yale coda -> Jyutping coda ("" is an open syllable).
	CODAS_JYUTPING = {
	    "p": "p",
	    "t": "t",
	    "k": "k",
	    "m": "m",
	    "n": "n",
	    "ng": "ng",
	    "i": "i",
	    "u": "u",
	    "": "",
	}

	onsets = list(ONSETS_JYUTPING.keys())
	nuclei = list(NUCLEI_JYUTPING.keys())
	codas = list(CODAS_JYUTPING.keys())

	# Longest spelling first, so digraphs ("gw", "ng", "aa", ...) are tried
	# before the single letters they start with.
	_ONSETS_LONGEST_FIRST = sorted(onsets, key=len, reverse=True)
	_NUCLEI_LONGEST_FIRST = sorted(nuclei, key=len, reverse=True)

	# Combining tone marks as they appear after NFD normalisation.
	_GRAVE = "\u0300"
	_ACUTE = "\u0301"
	_MACRON = "\u0304"


	def _split_onset_and_tone(yale):
	    """Split one Yale syllable into ``(onset, rest, tone)``.

	    ``onset`` is the longest key of ``ONSETS_JYUTPING`` the syllable starts
	    with, ``rest`` is what follows it with diacritics and the low-register
	    ``h`` removed, and ``tone`` is the Jyutping tone number (1-6).

	    The low-register ``h`` is only looked for *after* the onset, so the
	    initials ``h`` and ``ch`` are never mistaken for the tone marker.
	    """
	    decomposed = unicodedata.normalize("NFD", yale)
	    base = "".join(c for c in decomposed if unicodedata.category(c) != "Mn")

	    onset, rest = "", base
	    for candidate in _ONSETS_LONGEST_FIRST:
	        if base.startswith(candidate):
	            onset, rest = candidate, base[len(candidate) :]
	            break

	    low_register = "h" in rest
	    if low_register:
	        rest = rest.replace("h", "", 1)

	    # Standard Yale: a-macron / a-grave = 1, a-acute = 2, bare a = 3,
	    # grave + h = 4, acute + h = 5, bare + h = 6.
	    if low_register:
	        if _GRAVE in decomposed:
	            tone = 4
	        elif _ACUTE in decomposed:
	            tone = 5
	        else:
	            tone = 6
	    elif _ACUTE in decomposed:
	        tone = 2
	    elif _MACRON in decomposed or _GRAVE in decomposed:
	        tone = 1
	    else:
	        tone = 3
	    return onset, rest, tone


	def parse_yale(yale):
	    """Parse a single Yale syllable.

	    Returns ``(onset, nucleus, coda, tone)`` where the first three items are
	    Yale spellings (keys of the three mapping tables) and ``tone`` is the
	    Jyutping tone number, or ``None`` when the text does not fit the
	    onset + nucleus + coda shape described by the tables.
	    """
	    onset, rest, tone = _split_onset_and_tone(yale)

	    if not rest:
	        # A bare "m" / "ng" was consumed as the onset above, but it is
	        # really a syllabic nasal acting as the nucleus.
	        if onset in ("m", "ng"):
	            return "", onset, "", tone
	        return None

	    # Yale drops the second "y" of y + yu, so "yu", "yun" and "yut" after the
	    # initial "y" carry the nucleus "yu" (Jyutping jyu, jyun, jyut).
	    if onset == "y" and rest in ("u", "un", "ut"):
	        return onset, "yu", rest[1:], tone

	    for nucleus in _NUCLEI_LONGEST_FIRST:
	        if rest.startswith(nucleus):
	            coda = rest[len(nucleus) :]
	            if coda in codas:
	                return onset, nucleus, coda, tone
	    return None


	def convert_yale_to_jyutping(yale):
	    """Convert Yale text to Jyutping.

	    ``/`` separates alternatives and whitespace separates syllables; both
	    levels are converted piece by piece and re-joined with ``/`` and a single
	    space respectively.
	    """
	    return "/".join(convert_single(part) for part in yale.split("/"))


	def convert_single(yale):
	    """Convert whitespace-separated Yale syllables, joining them with spaces.

	    Text containing no syllables at all yields an empty string.
	    """
	    return " ".join(convert_single_syl(syl) for syl in yale.split())


	def convert_single_syl(yale):
	    """Convert one Yale syllable to Jyutping with a trailing tone digit.

	    Syllables that ``parse_yale`` cannot analyse fall back to a best-effort
	    form: diacritics and the low-register ``h`` are removed and the detected
	    tone digit is appended to the otherwise unchanged letters. An empty
	    string is returned unchanged.
	    """
	    if not yale:
	        return yale

	    parsed = parse_yale(yale)
	    if parsed is not None:
	        onset, nucleus, coda, tone = parsed
	        onset_jp = ONSETS_JYUTPING.get(onset, onset)
	        nucleus_jp = NUCLEI_JYUTPING.get(nucleus, nucleus)
	        if nucleus == "eu" and coda in ("i", "n", "t"):
	            # Yale eui / eun / eut are Jyutping eoi / eon / eot.
	            nucleus_jp = "eo"
	        elif nucleus == "a" and coda == "":
	            # Yale writes the open-syllable long vowel with a single "a".
	            nucleus_jp = "aa"
	        coda_jp = CODAS_JYUTPING.get(coda, coda)
	        return f"{onset_jp}{nucleus_jp}{coda_jp}{tone}"

	    # Fallback: keep the letters as they are, but use the same tone rules as
	    # the parsed path so both branches agree.
	    onset, rest, tone = _split_onset_and_tone(yale)
	    return f"{onset}{rest}{tone}"


	def convert_file(input_path, output_path):
	    """Rewrite every ``**...**`` span of a UTF-8 text file as Jyutping.

	    The file at ``input_path`` is read in full, the text inside each pair of
	    double asterisks is passed through ``convert_yale_to_jyutping`` and put
	    back between double asterisks, and the result is written to
	    ``output_path`` as UTF-8.
	    """
	    with open(input_path, "r", encoding="utf-8") as f:
	        content = f.read()

	    # Convert the text captured between the asterisks and keep the markup.
	    def replace_match(match):
	        yale = match.group(1)
	        jyutping = convert_yale_to_jyutping(yale)
	        return f"**{jyutping}**"

	    # Literal "**", a non-greedy capture, literal "**". The asterisks must be
	    # escaped; an unescaped or missing "*" makes the pattern match backslashes
	    # instead of the bold markup.
	    pattern = r"\*\*(.*?)\*\*"
	    new_content = re.sub(pattern, replace_match, content)

	    with open(output_path, "w", encoding="utf-8") as f:
	        f.write(new_content)


	if __name__ == "__main__":
	    import sys

	    # Usage: python yale_to_jyutping.py [INPUT.md [OUTPUT.md]]
	    # Without arguments the original hard-coded locations are used, so
	    # existing invocations keep working.
	    default_input = "/Users/josephcheng/Projects/wordshk_conversations/Dictionary of Cantonese Slang_extracted.md"
	    default_output = (
	        "/Users/josephcheng/Projects/wordshk_conversations/Dictionary_converted.md"
	    )
	    cli_paths = sys.argv[1:]
	    input_file = cli_paths[0] if len(cli_paths) > 0 else default_input
	    output_file = cli_paths[1] if len(cli_paths) > 1 else default_output
	    convert_file(input_file, output_file)
No results found