Skip to content

Instantly share code, notes, and snippets.

@7shi
Last active April 6, 2026 07:41
Show Gist options
  • Select an option

  • Save 7shi/d43c0fb980bc89aac30286254969252c to your computer and use it in GitHub Desktop.

Select an option

Save 7shi/d43c0fb980bc89aac30286254969252c to your computer and use it in GitHub Desktop.
[py] Bidirectional conversion between Kannada script and IAST romanization.
"""
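Bidirectional conversion between Kannada script and IAST romanization.

The romanization is IAST with a few ISO 15919-style conventions: vocalic
r/l are written r̥/l̥ (so that ḷ can stand for the consonant ಳ), and short
e/o are distinguished from long ē/ō.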
"""
# Sign that suppresses the inherent vowel of the preceding consonant.
VIRAMA = "\u0ccd"

# Dependent vowel signs (matras) that attach to a preceding consonant.
V_AA = "\u0cbe"
V_I = "\u0cbf"
V_II = "\u0cc0"
V_U = "\u0cc1"
V_UU = "\u0cc2"
V_R = "\u0cc3"
V_RR = "\u0cc4"
V_L = "\u0ce2"   # KANNADA VOWEL SIGN VOCALIC L
V_LL = "\u0ce3"  # KANNADA VOWEL SIGN VOCALIC LL
V_E = "\u0cc6"
V_EE = "\u0cc7"
V_AI = "\u0cc8"
V_O = "\u0cca"
V_OO = "\u0ccb"
V_AU = "\u0ccc"

# Vowels: romanized key -> (independent form, dependent/matra form).
# Only 'a' has an empty matra: it is the vowel a bare consonant already
# carries, so nothing has to be written for it.
VOWELS = {
    'a': ('ಅ', ''),
    'ā': ('ಆ', V_AA),
    'i': ('ಇ', V_I),
    'ī': ('ಈ', V_II),
    'u': ('ಉ', V_U),
    'ū': ('ಊ', V_UU),
    'r̥': ('ಋ', V_R),
    'r̥̄': ('ೠ', V_RR),
    'l̥': ('ಌ', V_L),
    'l̥̄': ('ೡ', V_LL),
    'e': ('ಎ', V_E),
    'ē': ('ಏ', V_EE),
    'ai': ('ಐ', V_AI),
    'o': ('ಒ', V_O),
    'ō': ('ಓ', V_OO),
    'au': ('ಔ', V_AU),
}
# Consonants: romanized key -> Kannada letter (which carries an inherent 'a').
# One row per traditional group.
CONSONANTS = {
    # velars
    'k': 'ಕ', 'kh': 'ಖ', 'g': 'ಗ', 'gh': 'ಘ', 'ṅ': 'ಙ',
    # palatals
    'c': 'ಚ', 'ch': 'ಛ', 'j': 'ಜ', 'jh': 'ಝ', 'ñ': 'ಞ',
    # retroflexes
    'ṭ': 'ಟ', 'ṭh': 'ಠ', 'ḍ': 'ಡ', 'ḍh': 'ಢ', 'ṇ': 'ಣ',
    # dentals
    't': 'ತ', 'th': 'ಥ', 'd': 'ದ', 'dh': 'ಧ', 'n': 'ನ',
    # labials
    'p': 'ಪ', 'ph': 'ಫ', 'b': 'ಬ', 'bh': 'ಭ', 'm': 'ಮ',
    # semivowels
    'y': 'ಯ', 'r': 'ರ', 'l': 'ಲ', 'v': 'ವ',
    # sibilants and h
    'ś': 'ಶ', 'ṣ': 'ಷ', 's': 'ಸ', 'h': 'ಹ',
    # additional letters: retroflex l and the two archaic letters
    'ḷ': 'ಳ', 'ṟ': 'ಱ', 'ḻ': 'ೞ',
}
# Marks that are neither vowels nor consonants: romanized key -> Kannada sign.
OTHER_MARKS = {
    'ṃ': '\N{KANNADA SIGN ANUSVARA}',  # ಂ
    'ḥ': '\N{KANNADA SIGN VISARGA}',   # ಃ
}
# Digits: ASCII digit -> Kannada digit, paired position by position.
DIGITS = dict(zip('0123456789', '೦೧೨೩೪೫೬೭೮೯'))
# Every romanized key longer than one character, longest first, so that a
# scan over this list meets the longest possible match before any shorter one.
MULTI_CHAR_KEYS = sorted(
    (key for key in {*VOWELS, *CONSONANTS, *OTHER_MARKS} if len(key) > 1),
    key=len,
    reverse=True,
)
# Reverse lookup tables: Kannada character -> romanized key.
KANNADA_TO_VOWEL = {forms[0]: roman for roman, forms in VOWELS.items()}
# Vowels without a matra (empty dependent form) are left out of this table.
KANNADA_TO_DEPENDENT_VOWEL = {
    forms[1]: roman for roman, forms in VOWELS.items() if forms[1]
}
KANNADA_TO_CONSONANT = {letter: roman for roman, letter in CONSONANTS.items()}
KANNADA_TO_OTHER = {sign: roman for roman, sign in OTHER_MARKS.items()}
KANNADA_TO_DIGIT = {digit: ascii_digit for ascii_digit, digit in DIGITS.items()}
def tokenize(text):
    """Break an IAST romanized string into conversion tokens.

    At each position the first entry of ``MULTI_CHAR_KEYS`` that matches
    (e.g. 'kh', 'ai') is taken as one token; if none matches, the single
    character at that position becomes a token by itself.

    Args:
        text: IAST romanized string.

    Returns:
        List of token strings, in input order.
    """
    pieces = []
    pos = 0
    end = len(text)
    while pos < end:
        # MULTI_CHAR_KEYS is ordered longest-first, so the first hit is also
        # the longest one; the fallback is the lone character at `pos`.
        piece = next(
            (key for key in MULTI_CHAR_KEYS if text.startswith(key, pos)),
            text[pos],
        )
        pieces.append(piece)
        pos += len(piece)
    return pieces
def romanize_to_kannada(text):
    """Convert IAST romanized text to Kannada script.

    The text is tokenized, then each token is mapped as follows:

    * digits and anusvara/visarga map to their Kannada characters;
    * a consonant followed by a vowel yields the consonant plus that
      vowel's matra (nothing extra for the inherent ``'a'``);
    * a consonant not followed by a vowel (cluster, end of text, any other
      character) yields the consonant plus virama;
    * a vowel not preceded by a consonant yields its independent form;
    * anything else is copied through unchanged.

    Args:
        text: IAST romanized string (e.g. ``"kannaḍa"``).

    Returns:
        Kannada script string (e.g. ``"ಕನ್ನಡ"``).
    """
    tokens = tokenize(text)
    count = len(tokens)
    result = []
    i = 0
    while i < count:
        token = tokens[i]
        i += 1
        if token in DIGITS:
            result.append(DIGITS[token])
        elif token in OTHER_MARKS:
            result.append(OTHER_MARKS[token])
        elif token in CONSONANTS:
            result.append(CONSONANTS[token])
            # Look at the token after the consonant. tokenize() has already
            # merged 'ai'/'au' into single tokens, so no further look-ahead
            # is needed to recognise diphthongs.
            following = tokens[i] if i < count else None
            if following in VOWELS:
                i += 1
                independent, dependent = VOWELS[following]
                if dependent:
                    result.append(dependent)
                elif following != 'a':
                    # The table has no matra for this vowel. Writing nothing
                    # would turn it into the inherent 'a', so keep it as
                    # virama + independent vowel rather than losing it.
                    result.append(VIRAMA)
                    result.append(independent)
                # Inherent 'a': the bare consonant already carries it.
            else:
                result.append(VIRAMA)
        elif token in VOWELS:
            result.append(VOWELS[token][0])
        else:
            result.append(token)
    return ''.join(result)
def kannada_to_romanize(text):
    """Convert Kannada script to IAST romanized text.

    The input is first normalized to NFC so that vowel signs supplied in
    canonically decomposed form (e.g. U+0CC6 U+0CD5 instead of U+0CC7) are
    recognised as one matra. As a consequence, characters that are merely
    copied through are also returned in NFC.

    A consonant followed by virama yields just the consonant; a consonant
    followed by a matra yields consonant plus that vowel; any other
    consonant gets the implicit ``'a'``. Digits, anusvara/visarga and
    independent vowels map to their romanized keys; everything else is
    copied through unchanged.

    Args:
        text: Kannada script string (e.g. ``"ಕನ್ನಡ"``).

    Returns:
        IAST romanized string (e.g. ``"kannaḍa"``).
    """
    import unicodedata

    # The lookup tables hold single code points; recompose two-part vowel
    # signs before scanning so the lookups below match.
    text = unicodedata.normalize('NFC', text)
    length = len(text)
    result = []
    i = 0
    while i < length:
        char = text[i]
        i += 1
        if char in KANNADA_TO_DIGIT:
            result.append(KANNADA_TO_DIGIT[char])
        elif char in KANNADA_TO_OTHER:
            result.append(KANNADA_TO_OTHER[char])
        elif char in KANNADA_TO_CONSONANT:
            result.append(KANNADA_TO_CONSONANT[char])
            following = text[i] if i < length else None
            if following == VIRAMA:
                # Virama is consumed silently: no vowel is written.
                i += 1
            elif following in KANNADA_TO_DEPENDENT_VOWEL:
                result.append(KANNADA_TO_DEPENDENT_VOWEL[following])
                i += 1
            else:
                # Bare consonant (including one at end of text).
                result.append('a')
        elif char in KANNADA_TO_VOWEL:
            result.append(KANNADA_TO_VOWEL[char])
        else:
            result.append(char)
    return ''.join(result)
def main():
    """Run the command-line converter.

    All positional words are joined with single spaces and converted as one
    string. Without options the text is treated as Kannada and romanized;
    with ``-k``/``--kannada`` it is treated as IAST and converted to Kannada.
    The result is printed to standard output.

    Args:
        None (arguments are read from sys.argv via argparse).

    Returns:
        None
    """
    import argparse

    cli = argparse.ArgumentParser(
        description='Convert between Kannada script and IAST romanization.'
    )
    cli.add_argument('words', nargs='+', help='Input words to convert.')
    cli.add_argument(
        '-k', '--kannada', action='store_true',
        help='Convert IAST romanization to Kannada script (default: Kannada to IAST).',
    )
    options = cli.parse_args()

    convert = romanize_to_kannada if options.kannada else kannada_to_romanize
    print(convert(' '.join(options.words)))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment