Last active
April 6, 2026 07:41
-
-
Save 7shi/d43c0fb980bc89aac30286254969252c to your computer and use it in GitHub Desktop.
[py] Bidirectional conversion between Kannada script and IAST romanization.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Bidirectional conversion between Kannada script and IAST romanization. | |
| """ | |
# Sign that suppresses the vowel of the preceding consonant; written between
# the members of a consonant cluster.
VIRAMA = '\u0ccd'

# Dependent vowel signs (matras): the form a vowel takes when it follows a
# consonant instead of standing on its own.
V_AA = "\u0cbe"
V_I = "\u0cbf"
V_II = "\u0cc0"
V_U = "\u0cc1"
V_UU = "\u0cc2"
V_R = "\u0cc3"
V_RR = "\u0cc4"
V_L = "\u0ce2"   # KANNADA VOWEL SIGN VOCALIC L
V_LL = "\u0ce3"  # KANNADA VOWEL SIGN VOCALIC LL
V_E = "\u0cc6"
V_EE = "\u0cc7"
V_AI = "\u0cc8"
V_O = "\u0cca"
V_OO = "\u0ccb"
V_AU = "\u0ccc"

# Vowels: romanized key -> (independent form, dependent/matra form).
# 'a' is the only entry with an empty matra: it is the vowel a bare consonant
# already carries, so nothing is written for it after a consonant.
VOWELS = {
    'a': ('ಅ', ''),
    'ā': ('ಆ', V_AA),
    'i': ('ಇ', V_I),
    'ī': ('ಈ', V_II),
    'u': ('ಉ', V_U),
    'ū': ('ಊ', V_UU),
    'r̥': ('ಋ', V_R),
    'r̥̄': ('ೠ', V_RR),
    'l̥': ('ಌ', V_L),
    'l̥̄': ('ೡ', V_LL),
    'e': ('ಎ', V_E),
    'ē': ('ಏ', V_EE),
    'ai': ('ಐ', V_AI),
    'o': ('ಒ', V_O),
    'ō': ('ಓ', V_OO),
    'au': ('ಔ', V_AU),
}
# Consonants: romanized key -> Kannada letter (which carries an inherent 'a').
CONSONANTS = {
    # velars
    'k': 'ಕ', 'kh': 'ಖ', 'g': 'ಗ', 'gh': 'ಘ', 'ṅ': 'ಙ',
    # palatals
    'c': 'ಚ', 'ch': 'ಛ', 'j': 'ಜ', 'jh': 'ಝ', 'ñ': 'ಞ',
    # retroflexes
    'ṭ': 'ಟ', 'ṭh': 'ಠ', 'ḍ': 'ಡ', 'ḍh': 'ಢ', 'ṇ': 'ಣ',
    # dentals
    't': 'ತ', 'th': 'ಥ', 'd': 'ದ', 'dh': 'ಧ', 'n': 'ನ',
    # labials
    'p': 'ಪ', 'ph': 'ಫ', 'b': 'ಬ', 'bh': 'ಭ', 'm': 'ಮ',
    # semivowels
    'y': 'ಯ', 'r': 'ರ', 'l': 'ಲ', 'v': 'ವ',
    # sibilants and h
    'ś': 'ಶ', 'ṣ': 'ಷ', 's': 'ಸ', 'h': 'ಹ',
    # additional letters
    'ḷ': 'ಳ', 'ṟ': 'ಱ', 'ḻ': 'ೞ',
}

# Marks that attach to a syllable rather than forming one of their own.
OTHER_MARKS = {
    'ṃ': '\u0c82',  # ಂ (anusvara)
    'ḥ': '\u0c83',  # ಃ (visarga)
}

# ASCII digit -> Kannada digit, paired position by position.
DIGITS = dict(zip('0123456789', '೦೧೨೩೪೫೬೭೮೯'))
# Romanized keys longer than one character, longest first, so the tokenizer
# can try the longest candidates before shorter ones.
MULTI_CHAR_KEYS = sorted(
    (
        key
        for key in VOWELS.keys() | CONSONANTS.keys() | OTHER_MARKS.keys()
        if len(key) > 1
    ),
    key=len,
    reverse=True,
)

# Reverse lookup tables (Kannada character -> romanized key).
KANNADA_TO_VOWEL = {
    independent: roman for roman, (independent, _) in VOWELS.items()
}
# Entries with an empty matra are left out: there is no character to look up.
KANNADA_TO_DEPENDENT_VOWEL = {
    matra: roman for roman, (_, matra) in VOWELS.items() if matra
}
KANNADA_TO_CONSONANT = {
    letter: roman for roman, letter in CONSONANTS.items()
}
KANNADA_TO_OTHER = {
    mark: roman for roman, mark in OTHER_MARKS.items()
}
KANNADA_TO_DIGIT = {
    kannada: ascii_digit for ascii_digit, kannada in DIGITS.items()
}
def tokenize(text):
    """Break an IAST romanized string into transliteration tokens.

    At every position the keys in ``MULTI_CHAR_KEYS`` are tried in order
    (longest first); the first key found at that position becomes the next
    token. When none is found, the single character at that position is the
    token. Digraphs such as 'kh' or 'ai' therefore stay in one piece.

    Args:
        text: IAST romanized string.

    Returns:
        List of token strings; joined together they reproduce ``text``.
    """
    pieces = []
    pos = 0
    end = len(text)
    while pos < end:
        # First multi-character key found here, else the lone character.
        piece = next(
            (key for key in MULTI_CHAR_KEYS if text.startswith(key, pos)),
            text[pos],
        )
        pieces.append(piece)
        pos += len(piece)
    return pieces
def romanize_to_kannada(text):
    """Convert IAST romanized text to Kannada script.

    The text is tokenized and each token is translated in turn:

    * digits and anusvara/visarga map directly to their Kannada characters;
    * a consonant is written and the following token decides what comes next:
      'a' is consumed without output (it is the consonant's inherent vowel),
      any other vowel with a dependent sign is written as that sign, and
      anything else (another consonant, a mark, punctuation, end of input)
      gets a virama so the consonant carries no vowel;
    * a vowel that does not follow a consonant is written in its independent
      form;
    * any other token is copied through unchanged.

    A vowel whose table entry has no dependent sign is never dropped: the
    consonant gets a virama and the vowel is then written in independent form.

    Args:
        text: IAST romanized string (e.g. ``"kannaḍa"``).

    Returns:
        Kannada script string (e.g. ``"ಕನ್ನಡ"``).
    """
    tokens = tokenize(text)
    total = len(tokens)
    result = []
    i = 0
    while i < total:
        token = tokens[i]
        i += 1

        if token in DIGITS:
            result.append(DIGITS[token])
        elif token in OTHER_MARKS:
            result.append(OTHER_MARKS[token])
        elif token in CONSONANTS:
            result.append(CONSONANTS[token])
            # Look one token ahead. The tokenizer already delivers 'ai' and
            # 'au' as single tokens, so no recombination of 'a' with the
            # token after it is needed here.
            following = tokens[i] if i < total else None
            matra = VOWELS[following][1] if following in VOWELS else ''
            if following == 'a':
                # Inherent vowel: consume it, write nothing.
                i += 1
            elif matra:
                result.append(matra)
                i += 1
            else:
                # No vowel sign can follow: mark the consonant as vowel-less
                # and leave the next token for the main loop.
                result.append(VIRAMA)
        elif token in VOWELS:
            result.append(VOWELS[token][0])
        else:
            result.append(token)
    return ''.join(result)
def kannada_to_romanize(text):
    """Convert Kannada script to IAST romanized text.

    The input is first normalized to Unicode NFC so that two-part vowel signs
    typed in decomposed form (for example U+0CC6 followed by the length mark
    U+0CD5) are seen as the single composed sign the lookup tables contain.
    As a side effect, characters that are merely passed through are returned
    in NFC form as well.

    A virama after a consonant is consumed silently (no vowel appended).
    Dependent vowel signs (matras) after a consonant are mapped back to their
    IAST equivalents. A bare consonant (not followed by virama or matra) is
    given an implicit ``'a'``. Digits, anusvara/visarga and independent vowels
    map directly; every other character is copied through unchanged.

    Args:
        text: Kannada script string (e.g. ``"ಕನ್ನಡ"``).

    Returns:
        IAST romanized string (e.g. ``"kannaḍa"``).
    """
    import unicodedata

    text = unicodedata.normalize('NFC', text)
    length = len(text)
    result = []
    i = 0
    while i < length:
        char = text[i]
        i += 1

        if char in KANNADA_TO_DIGIT:
            result.append(KANNADA_TO_DIGIT[char])
        elif char in KANNADA_TO_OTHER:
            result.append(KANNADA_TO_OTHER[char])
        elif char in KANNADA_TO_CONSONANT:
            result.append(KANNADA_TO_CONSONANT[char])
            # Look one character ahead to decide which vowel, if any, the
            # consonant carries.
            follower = text[i] if i < length else None
            if follower == VIRAMA:
                i += 1
            elif follower in KANNADA_TO_DEPENDENT_VOWEL:
                result.append(KANNADA_TO_DEPENDENT_VOWEL[follower])
                i += 1
            else:
                # Bare consonant (including one at the end of the text).
                result.append('a')
        elif char in KANNADA_TO_VOWEL:
            result.append(KANNADA_TO_VOWEL[char])
        else:
            result.append(char)
    return ''.join(result)
def main():
    """Run the command-line converter.

    Reads one or more positional words from the command line, joins them with
    single spaces, converts the result and prints it. With ``-k`` /
    ``--kannada`` the input is treated as IAST and converted to Kannada
    script; without it the input is treated as Kannada script and converted
    to IAST.

    Returns:
        None
    """
    import argparse

    cli = argparse.ArgumentParser(
        description='Convert between Kannada script and IAST romanization.'
    )
    cli.add_argument('words', nargs='+', help='Input words to convert.')
    cli.add_argument(
        '-k', '--kannada', action='store_true',
        help='Convert IAST romanization to Kannada script (default: Kannada to IAST).',
    )
    options = cli.parse_args()

    joined = ' '.join(options.words)
    convert = romanize_to_kannada if options.kannada else kannada_to_romanize
    print(convert(joined))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment