Skip to content

Instantly share code, notes, and snippets.

@indiejoseph
Created January 27, 2026 15:00
Show Gist options
  • Select an option

  • Save indiejoseph/75bbc33ff2e84846c9cd0924b6508503 to your computer and use it in GitHub Desktop.

Select an option

Save indiejoseph/75bbc33ff2e84846c9cd0924b6508503 to your computer and use it in GitHub Desktop.
import re
import unicodedata
# Yale -> Jyutping spelling tables, one per syllable slot.  Keys are Yale
# spellings with tone marks removed; values are the Jyutping spellings.
# The empty-string keys let a syllable have no onset or no coda.
ONSETS_JYUTPING = {
    "b": "b",
    "d": "d",
    "g": "g",
    "gw": "gw",
    "j": "z",
    "p": "p",
    "t": "t",
    "k": "k",
    # "kw" was missing, so every kw- syllable failed to parse.
    "kw": "kw",
    "ch": "c",
    "m": "m",
    "n": "n",
    "ng": "ng",
    "f": "f",
    "h": "h",
    "s": "s",
    "l": "l",
    "w": "w",
    "y": "j",
    "": "",
}
NUCLEI_JYUTPING = {
    "aa": "aa",
    "a": "a",
    "i": "i",
    "yu": "yu",
    "u": "u",
    "eu": "oe",
    "e": "e",
    "o": "o",
    # Syllabic nasals.
    "m": "m",
    "ng": "ng",
}
CODAS_JYUTPING = {
    "p": "p",
    "t": "t",
    "k": "k",
    "m": "m",
    "n": "n",
    "ng": "ng",
    "i": "i",
    "u": "u",
    "": "",
}
# Yale spellings accepted in each slot; parse_yale matches against these.
onsets = list(ONSETS_JYUTPING)
nuclei = list(NUCLEI_JYUTPING)
codas = list(CODAS_JYUTPING)
def parse_yale(yale):
    """Split one Yale-romanised Cantonese syllable into its parts.

    The input is NFD-decomposed so tone diacritics become separate combining
    marks, then the bare letters are matched as onset + nucleus + coda
    against the module-level ``onsets``, ``nuclei`` and ``codas`` lists.

    Tone is derived from the diacritic plus the low-register "h" that Yale
    writes after the vowel:

    * no "h": macron or grave -> 1 (Jyutping has no separate high-falling
      tone), acute -> 2, unmarked -> 3
    * with "h": grave -> 4, acute -> 5, unmarked -> 6

    Args:
        yale: A single syllable, precomposed or decomposed.

    Returns:
        ``(onset, nucleus, coda, tone)`` where the first three are Yale
        spellings and ``tone`` is an int from 1 to 6, or ``None`` when no
        onset/nucleus/coda split fits.
    """
    decomposed = unicodedata.normalize("NFD", yale)
    marks = {ch for ch in decomposed if unicodedata.category(ch) == "Mn"}
    base = "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    grave, acute, macron = "\u0300", "\u0301", "\u0304"
    if macron in marks or grave in marks:
        high_tone = 1
    elif acute in marks:
        high_tone = 2
    else:
        high_tone = 3
    if grave in marks:
        low_tone = 4
    elif acute in marks:
        low_tone = 5
    else:
        low_tone = 6

    # Longest candidates first so "ng"/"gw"/"aa" win over "n"/"g"/"a".
    onsets_by_length = sorted(onsets, key=len, reverse=True)
    nuclei_by_length = sorted(nuclei, key=len, reverse=True)

    # Try every onset that fits instead of committing to the first one, so a
    # syllabic nasal such as "m" or "ng" can still match with an empty onset.
    for onset in onsets_by_length:
        if not base.startswith(onset):
            continue
        rime = base[len(onset):]
        # Only an "h" after the onset marks the low register; the "h" inside
        # the initials "h" and "ch" is a consonant and must be kept.
        low_register = "h" in rime
        rime = rime.replace("h", "")
        for nucleus in nuclei_by_length:
            if not rime.startswith(nucleus):
                continue
            coda = rime[len(nucleus):]
            if coda in codas:
                tone = low_tone if low_register else high_tone
                return onset, nucleus, coda, tone
    return None
def convert_yale_to_jyutping(yale):
    """Convert a Yale string to Jyutping, keeping "/"-separated alternatives.

    Each piece between slashes is passed to ``convert_single`` and the
    results are joined with "/" again.  A string without a slash is simply
    one piece.
    """
    alternatives = yale.split("/")
    return "/".join(convert_single(alternative) for alternative in alternatives)
def convert_single(yale):
    """Convert a word of one or more space-separated Yale syllables.

    When the text contains a space it is split on whitespace, every syllable
    goes through ``convert_single_syl`` and the results are re-joined with
    single spaces; otherwise the whole text is treated as one syllable.
    """
    if " " not in yale:
        return convert_single_syl(yale)
    return " ".join(convert_single_syl(syllable) for syllable in yale.split())
def convert_single_syl(yale):
    """Convert one Yale syllable to Jyutping with a trailing tone digit.

    The syllable is split by ``parse_yale`` and each part is looked up in the
    Yale -> Jyutping tables, with three spelling rules the plain tables
    cannot express:

    * open "a" (no coda) is Jyutping "aa" in every tone, not only tone 1;
    * "eu" is "eo" before the codas n/t/i and "oe" otherwise;
    * Yale drops the vowel's "y" after the initial "y", so y + u with no
      coda or with n/t is Jyutping "jyu", "jyun", "jyut".

    Args:
        yale: A single syllable (no spaces or slashes).

    Returns:
        The Jyutping spelling followed by the tone number.  Empty input is
        returned unchanged; text that cannot be parsed goes through a
        best-effort fallback.
    """
    if not yale:
        # Avoid turning an empty "****" span into a bare tone digit.
        return yale

    parsed = parse_yale(yale)
    if parsed is None:
        return _convert_unparsed_syl(yale)

    onset, nucleus, coda, tone = parsed
    onset_jp = ONSETS_JYUTPING.get(onset, onset)
    coda_jp = CODAS_JYUTPING.get(coda, coda)
    if nucleus == "a" and coda == "":
        nucleus_jp = "aa"
    elif nucleus == "eu":
        nucleus_jp = "eo" if coda in ("n", "t", "i") else "oe"
    elif nucleus == "u" and onset == "y" and coda in ("", "n", "t"):
        nucleus_jp = "yu"
    else:
        nucleus_jp = NUCLEI_JYUTPING.get(nucleus, nucleus)
    return f"{onset_jp}{nucleus_jp}{coda_jp}{tone}"


def _convert_unparsed_syl(yale):
    """Best-effort conversion for text ``parse_yale`` could not split.

    Strips the tone diacritics and the low-register "h", rewrites a leading
    "ch" as "c", and appends the tone number.  Marks are read from the NFD
    form, so they are found on any letter and in decomposed input.
    """
    decomposed = unicodedata.normalize("NFD", yale)
    marks = {ch for ch in decomposed if unicodedata.category(ch) == "Mn"}
    base = "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    # The "h" of the initials "h"/"ch" is a consonant; only a later "h" is
    # the low-register marker.
    if base.startswith("ch"):
        head, rest = "c", base[2:]
    elif base.startswith("h"):
        head, rest = "h", base[1:]
    else:
        head, rest = "", base
    low_register = "h" in rest

    grave, acute, macron = "\u0300", "\u0301", "\u0304"
    if low_register:
        if grave in marks:
            tone = 4
        elif acute in marks:
            tone = 5
        else:
            tone = 6
    elif macron in marks or grave in marks:
        # Grave without "h" is high falling, which Jyutping writes as tone 1.
        tone = 1
    elif acute in marks:
        tone = 2
    else:
        tone = 3
    return head + rest.replace("h", "") + str(tone)
def convert_file(input_path, output_path):
    """Rewrite every ``**...**`` span of a UTF-8 text file as Jyutping.

    The whole input file is read, the text inside each pair of double
    asterisks is passed through ``convert_yale_to_jyutping``, and the result
    is written to ``output_path`` (also UTF-8).  Text outside the spans is
    copied unchanged.
    """
    # Non-greedy, so each span ends at the nearest closing "**".
    bold_span = re.compile(r"\*\*(.*?)\*\*")

    with open(input_path, "r", encoding="utf-8") as source:
        markdown = source.read()

    converted = bold_span.sub(
        lambda hit: f"**{convert_yale_to_jyutping(hit.group(1))}**",
        markdown,
    )

    with open(output_path, "w", encoding="utf-8") as target:
        target.write(converted)
if __name__ == "__main__":
    import sys

    # Defaults are the paths this script was written for.  Pass
    # "<input> <output>" on the command line to convert other files.
    input_file = "/Users/josephcheng/Projects/wordshk_conversations/Dictionary of Cantonese Slang_extracted.md"
    output_file = (
        "/Users/josephcheng/Projects/wordshk_conversations/Dictionary_converted.md"
    )
    cli_args = sys.argv[1:]
    if len(cli_args) == 2:
        input_file, output_file = cli_args
    elif cli_args:
        sys.exit(f"usage: {sys.argv[0]} [INPUT_FILE OUTPUT_FILE]")
    convert_file(input_file, output_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment