Created
January 27, 2026 15:00
-
-
Save indiejoseph/75bbc33ff2e84846c9cd0924b6508503 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re | |
| import unicodedata | |
# Yale -> Jyutping spelling of syllable onsets (initial consonants).
# The empty key covers syllables that start directly with the nucleus.
ONSETS_JYUTPING = {
    "b": "b",
    "d": "d",
    "g": "g",
    "gw": "gw",
    # "kw" is a regular Yale onset; without it every kw- syllable fails the
    # structured parse and ends up in the heuristic fallback.
    "kw": "kw",
    "j": "z",
    "p": "p",
    "t": "t",
    "k": "k",
    "ch": "c",
    "m": "m",
    "n": "n",
    "ng": "ng",
    "f": "f",
    "h": "h",
    "s": "s",
    "l": "l",
    "w": "w",
    "y": "j",
    "": "",
}

# Yale -> Jyutping spelling of syllable nuclei. "m" and "ng" are listed so
# that syllabic nasals can be represented as a nucleus.
NUCLEI_JYUTPING = {
    "aa": "aa",
    "a": "a",
    "i": "i",
    "yu": "yu",
    "u": "u",
    "eu": "oe",
    "e": "e",
    "o": "o",
    "m": "m",
    "ng": "ng",
}

# Yale -> Jyutping spelling of syllable codas. The empty key covers open
# syllables.
CODAS_JYUTPING = {
    "p": "p",
    "t": "t",
    "k": "k",
    "m": "m",
    "n": "n",
    "ng": "ng",
    "i": "i",
    "u": "u",
    "": "",
}

# Yale spellings of each syllable part, in table order, used by the parser.
onsets = list(ONSETS_JYUTPING)
nuclei = list(NUCLEI_JYUTPING)
codas = list(CODAS_JYUTPING)
def _yale_tone(marks, low_register):
    """Return the tone number (1-6) for a set of combining marks and register."""
    grave, acute, macron = "\u0300", "\u0301", "\u0304"
    if low_register:
        # Low register (marked by "h" in the rime): àh = 4, áh = 5, ah = 6.
        if grave in marks:
            return 4
        if acute in marks:
            return 5
        return 6
    if acute in marks:
        return 2
    # Macron (high level) and grave without "h" (high falling) are both tone 1.
    if macron in marks or grave in marks:
        return 1
    return 3


def parse_yale(yale):
    """Split one Yale syllable into ``(onset, nucleus, coda, tone)``.

    The syllable is NFD-decomposed so that tone diacritics become separate
    combining marks. Onsets are tried longest first; when the rest of the
    syllable cannot be read as nucleus + coda, the next shorter onset is
    tried, which lets syllabic "m" / "ng" parse with an empty onset.

    The low-register "h" is only looked for after the onset, so an initial
    "h" or "ch" is never mistaken for the tone marker.

    Args:
        yale: A single Yale-romanized syllable, with or without diacritics.

    Returns:
        A tuple ``(onset, nucleus, coda, tone)`` whose first three items are
        Yale spellings taken from ``onsets``, ``nuclei`` and ``codas`` and
        whose last item is an int from 1 to 6, or ``None`` when the syllable
        does not fit that structure.
    """
    decomposed = unicodedata.normalize("NFD", yale)
    marks = {ch for ch in decomposed if unicodedata.category(ch) == "Mn"}
    base = "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    nuclei_longest_first = sorted(nuclei, key=len, reverse=True)
    for onset in sorted(onsets, key=len, reverse=True):
        if not base.startswith(onset):
            continue
        rime = base[len(onset):]
        # Any "h" left after the onset is the low-register marker, not a sound.
        low_register = "h" in rime
        rime = rime.replace("h", "")
        for nucleus in nuclei_longest_first:
            if not rime.startswith(nucleus):
                continue
            coda = rime[len(nucleus):]
            if coda in codas:
                return onset, nucleus, coda, _yale_tone(marks, low_register)
    return None
def convert_yale_to_jyutping(yale):
    """Convert a Yale string, possibly holding "/"-separated alternatives.

    Each alternative is converted on its own with ``convert_single`` and the
    results are joined back with "/".
    """
    # str.split always yields at least one piece, so input without a slash
    # is simply treated as a single alternative.
    return "/".join(convert_single(variant) for variant in yale.split("/"))
def convert_single(yale):
    """Convert one Yale expression made of one or more syllables.

    Input without a space character goes to ``convert_single_syl`` as is.
    Otherwise it is split on whitespace, each piece is converted, and the
    results are joined with single spaces.
    """
    if " " not in yale:
        return convert_single_syl(yale)
    return " ".join(map(convert_single_syl, yale.split()))
def convert_single_syl(yale):
    """Convert a single Yale syllable to Jyutping with a trailing tone digit.

    The syllable is first parsed with ``parse_yale`` and each part is mapped
    through the onset / nucleus / coda tables. Three spellings depend on
    context and are adjusted after the table lookup:

    * Yale "eu" is Jyutping "eo" before the codas i / n / t, "oe" otherwise.
    * Yale "a" in an open syllable is the long vowel, Jyutping "aa", in
      every tone.
    * Yale "yu", "yun", "yut" (parsed as onset "y" + nucleus "u") are
      Jyutping "jyu", "jyun", "jyut".

    When parsing fails, a best-effort fallback strips the diacritics and the
    low-register "h" and appends the tone read from the combining marks.

    Args:
        yale: One Yale-romanized syllable.

    Returns:
        The Jyutping spelling followed by its tone number. Empty input is
        returned unchanged.
    """
    if not yale:
        # Nothing to convert; avoid producing a bare tone digit.
        return yale

    parsed = parse_yale(yale)
    if parsed:
        onset, nucleus, coda, tone = parsed
        onset_jp = ONSETS_JYUTPING.get(onset, onset)
        nucleus_jp = NUCLEI_JYUTPING.get(nucleus, nucleus)
        coda_jp = CODAS_JYUTPING.get(coda, coda)
        if nucleus == "eu" and coda in ("i", "n", "t"):
            nucleus_jp = "eo"
        elif nucleus == "a" and coda == "":
            nucleus_jp = "aa"
        elif onset == "y" and nucleus == "u" and coda in ("", "n", "t"):
            nucleus_jp = "yu"
        return f"{onset_jp}{nucleus_jp}{coda_jp}{tone}"

    # Fallback: the syllable did not fit onset + nucleus + coda.
    # Decompose first so marks on any base letter (including m / n) and
    # already-decomposed input are detected, not just precomposed vowels.
    decomposed = unicodedata.normalize("NFD", yale)
    marks = {ch for ch in decomposed if unicodedata.category(ch) == "Mn"}
    letters = "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    # An initial "h" / "ch" is a consonant, not the low-register marker.
    if letters.startswith("ch"):
        head, tail = "c", letters[2:]
    elif letters.startswith("h"):
        head, tail = "h", letters[1:]
    else:
        head, tail = "", letters
    low_register = "h" in tail
    tail = tail.replace("h", "")

    if low_register:
        # àh = 4, áh = 5, ah = 6
        if "\u0300" in marks:
            tone = 4
        elif "\u0301" in marks:
            tone = 5
        else:
            tone = 6
    elif "\u0301" in marks:
        tone = 2
    elif "\u0304" in marks or "\u0300" in marks:
        # High level (macron) and high falling (grave) are both tone 1.
        tone = 1
    else:
        tone = 3
    return head + tail + str(tone)
def convert_file(input_path, output_path):
    """Rewrite every ``**...**`` span of a UTF-8 text file from Yale to Jyutping.

    The whole input file is read, the text between each pair of double
    asterisks is passed through ``convert_yale_to_jyutping``, and the result
    is written to ``output_path`` as UTF-8. Text outside the spans is copied
    unchanged.
    """
    with open(input_path, "r", encoding="utf-8") as source:
        text = source.read()

    # Non-greedy so that each span ends at the nearest closing "**".
    bold_span = re.compile(r"\*\*(.*?)\*\*")
    converted = bold_span.sub(
        lambda found: f"**{convert_yale_to_jyutping(found.group(1))}**",
        text,
    )

    with open(output_path, "w", encoding="utf-8") as sink:
        sink.write(converted)
if __name__ == "__main__":
    import sys

    # Usage: script.py [input_path [output_path]]
    # Paths given on the command line override the defaults below, so the
    # script can be run on other files without editing the source.
    default_input = "/Users/josephcheng/Projects/wordshk_conversations/Dictionary of Cantonese Slang_extracted.md"
    default_output = (
        "/Users/josephcheng/Projects/wordshk_conversations/Dictionary_converted.md"
    )
    cli_paths = sys.argv[1:]
    input_file = cli_paths[0] if len(cli_paths) >= 1 else default_input
    output_file = cli_paths[1] if len(cli_paths) >= 2 else default_output
    convert_file(input_file, output_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment