Skip to content

Instantly share code, notes, and snippets.

@hrishikeshrt
Last active September 20, 2024 12:56
Show Gist options
  • Save hrishikeshrt/130c0b494a3370c0cfd0e011a8c3a082 to your computer and use it in GitHub Desktop.
Save hrishikeshrt/130c0b494a3370c0cfd0e011a8c3a082 to your computer and use it in GitHub Desktop.
Sanskrit Utility Functions
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 17 22:20:39 2018
Updated on Sun Jul 03 21:39:43 2022
@author: Hrishikesh Terdalkar
"""
import re
import logging
from collections import defaultdict
from itertools import product
logger = logging.getLogger(__name__)
###############################################################################
def ord_unicode(ch):
    """Return the code point of ``ch`` as lowercase hex, left-padded to 4 digits."""
    code_point = ord(ch)
    return format(code_point, 'x').zfill(4)
def chr_unicode(u):
    """Return the character whose code point is the hexadecimal string ``u``."""
    code_point = int(u, 16)
    return chr(code_point)
###############################################################################
# Alphabet of Sanskrit
# Every constant below is either a single string or a plain list of strings;
# the helpers in this module test membership against these lists and rely on
# the element ORDER of several of them, so do not reorder entries.

# MATRA[i] pairs with SWARA[i + 1] (SWARA[0] has no sign of its own); the
# maatra <-> swara conversion helpers depend on this alignment.
MATRA = ['ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॄ', 'ॢ', 'ॣ', 'े', 'ै', 'ो', 'ौ']
SWARA = ['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ॠ', 'ऌ', 'ॡ', 'ए', 'ऐ', 'ओ', 'औ']
# Five groups of five consonants each, concatenated in this order into VARGIYA.
KANTHYA = ['क', 'ख', 'ग', 'घ', 'ङ']
TALAVYA = ['च', 'छ', 'ज', 'झ', 'ञ']
MURDHANYA = ['ट', 'ठ', 'ड', 'ढ', 'ण']
DANTYA = ['त', 'थ', 'द', 'ध', 'न']
AUSHTHYA = ['प', 'फ', 'ब', 'भ', 'म']
ANTAHSTHA = ['य', 'र', 'ल', 'व']
USHMA = ['श', 'ष', 'स', 'ह']
VISHISHTA = ['ळ']
# The first 25 entries of VYANJANA are exactly VARGIYA; get_anunaasika uses
# that fact (index < 25) to locate the group of a consonant.
VARGIYA = KANTHYA + TALAVYA + MURDHANYA + DANTYA + AUSHTHYA
VYANJANA = VARGIYA + ANTAHSTHA + USHMA + VISHISHTA
# n-th member (1st .. 5th) of each of the five groups in VARGIYA.
VARGA_PRATHAMA = [VARGIYA[i * 5] for i in range(5)]
VARGA_DWITIYA = [VARGIYA[i * 5 + 1] for i in range(5)]
VARGA_TRITIYA = [VARGIYA[i * 5 + 2] for i in range(5)]
VARGA_CHATURTHA = [VARGIYA[i * 5 + 3] for i in range(5)]
VARGA_PANCHAMA = [VARGIYA[i * 5 + 4] for i in range(5)]
# Entries picked by position: even indices 0..8 of SWARA, odd indices 1..7 of
# MATRA. is_laghu() treats exactly these (plus consonants and HALANTA) as laghu.
LAGHU_SWARA = [SWARA[i] for i in [0, 2, 4, 6, 8]]
LAGHU_MATRA = [MATRA[i] for i in [1, 3, 5, 7]]
# Individual marks and signs (code points for several are listed in
# HOW_TO_WRITE).
OM = 'ॐ'
AVAGRAHA = 'ऽ'
SWARITA = '॑'
DOUBLE_SWARITA = '᳚'
TRIPLE_SWARITA = '᳛'
ANUDATTA = '॒'
CHANDRABINDU = 'ँ'
CHANDRABINDU_VIRAMA = 'ꣳ'
CHANDRABINDU_SPACING = 'ꣲ'
ANUSWARA = 'ं'
VISARGA = 'ः'
ARDHAVISARGA = 'ᳲ'
JIHVAAMULIYA = 'ᳵ'
UPADHMANIYA = 'ᳶ'
HALANTA = '्'
NUKTA = '़'
ABBREV = '॰'
DANDA = '।'
DOUBLE_DANDA = '॥'
# Groupings of the marks above. Note that ARDHAVISARGA is defined but is not a
# member of any group, and therefore not part of ALPHABET.
EXTRA_MATRA = [CHANDRABINDU, ANUSWARA, VISARGA]
AYOGAVAAHA = EXTRA_MATRA + [JIHVAAMULIYA, UPADHMANIYA]
VEDIC_MARKS = [SWARITA, ANUDATTA, DOUBLE_SWARITA, TRIPLE_SWARITA]
SPECIAL = [AVAGRAHA, OM, NUKTA, CHANDRABINDU_VIRAMA, CHANDRABINDU_SPACING]
OTHER = [HALANTA]
VARNA = SWARA + VYANJANA
# ALPHABET is the whitelist used by clean(); anything outside it (plus the
# optional extras clean() adds) is dropped.
ALPHABET = VARNA + MATRA + AYOGAVAAHA + SPECIAL + OTHER + VEDIC_MARKS
SPACES = [' ', '\t', '\n', '\r']
PUNC = [DANDA, DOUBLE_DANDA, ABBREV]
# NOTE(review): the fourth entry is an empty string, which can never match a
# single character in clean(); possibly a character lost in transit — confirm
# against the upstream source.
GEN_PUNC = ['.', ',', ';', '', '"', "'", '`']
DIGITS = ['०', '१', '२', '३', '४', '५', '६', '७', '८', '९']
# Not included in ALPHABET or DIGITS, so clean() removes these by default.
COMBINING_DIGIT_MARKS = ['꣠', '꣡', '꣢', '꣣', '꣤', '꣥', '꣦', '꣧', '꣨', '꣩']
# Two-consonant conjuncts written out as consonant + HALANTA + consonant.
KSHA = 'क्ष'
JNA = 'ज्ञ'
###############################################################################
# Free-form help text for typing the less common characters; it is only
# defined here and not referenced by any code visible in this file.
# NOTE(review): "chan" in the first sentence looks like a typo for "can"; left
# unchanged because the string is runtime data.
HOW_TO_WRITE = """
Unicode characters chan be typed directly from the keyboard as follows,
[Ctrl+Shift+u] [4-digit-unicode-identifier] [space]
Some of the characters can also be typed using m17n-sanskrit-itrans keyboard
(Package: https://launchpad.net/ubuntu/+source/ibus-m17n)
(File: /usr/share/m17n/sa-itrans.mim)
Notable Unicodes and Shortcuts
---
1cf2 for Ardhavisarga
1cf5 for Jihvamuliya -- kH
1cf6 for Upadhmaniya -- pH
0951 for Swarita -- ''
0952 for Anudatta -- _
0901 for Chandrabindu -- .N
a8f2 for (stand-alone) Chandrabindu Spacing
093d for Avagraha -- .a
094d for Halanta -- .h
0950 for Om -- OM
a8e0 to a8e9 for Combining Devanagari Digits 0-9 (Swara Marks for Samaveda)
"""
###############################################################################
# Fourteen rows. The last entry of every row ends in HALANTA; the code below
# treats such entries as end markers rather than members (they receive the
# serial index -1 and are filtered out when a pratyaahaara is resolved).
# Some letters (e.g. 'ह', and the marker 'ण्') occur in more than one row,
# which is why the position index built from this table stores lists.
MAAHESHWARA_SUTRA = [
['अ', 'इ', 'उ', 'ण्'],
['ऋ', 'ऌ', 'क्'],
['ए', 'ओ', 'ङ्'],
['ऐ', 'औ', 'च्'],
['ह', 'य', 'व', 'र', 'ट्'],
['ल', 'ण्'],
['ञ', 'म', 'ङ', 'ण', 'न', 'म्'],
['झ', 'भ', 'ञ्'],
['घ', 'ढ', 'ध', 'ष्'],
['ज', 'ब', 'ग', 'ड', 'द', 'श्'],
['ख', 'फ', 'छ', 'ठ', 'थ', 'च', 'ट', 'त', 'व्'],
['क', 'प', 'य्'],
['श', 'ष', 'स', 'र्'],
['ह', 'ल्']
]
# --------------------------------------------------------------------------- #
# All rows flattened into one sequence, markers included, in table order.
MAAHESHWARA_KRAMA = [varna for sutra in MAAHESHWARA_SUTRA for varna in sutra]
# --------------------------------------------------------------------------- #
# Maps every entry of MAAHESHWARA_SUTRA to the list of its occurrences, each a
# tuple (row number, position within the row, serial number). Serial numbers
# count only the entries without HALANTA; entries with HALANTA get -1.
MAAHESHWARA_IDX = defaultdict(list)
idx = 0
for _sutra_idx, sutra in enumerate(MAAHESHWARA_SUTRA):
    for _internal_idx, varna in enumerate(sutra):
        _idx = -1 if HALANTA in varna else idx
        if _idx != -1:
            # Only real members advance the running serial number.
            idx += 1
        MAAHESHWARA_IDX[varna].append((_sutra_idx, _internal_idx, _idx))
###############################################################################
def form_pratyaahaara(letters):
    """
    Form a pratyaahaara from a list of letters

    @params:
        letters: iterable of letters; entries that are not found in
                 MAAHESHWARA_IDX, or that contain HALANTA, are ignored
                 (and reported through the module logger)

    @return:
        The pratyaahaara as a string (first letter followed by the end marker
        of the row holding the last letter), or None when
        - no usable letter remains after ignoring,
        - the letters do not occupy consecutive serial positions, or
        - the last letter is not the final member of its row.
    """
    candidates = []
    ignored = []
    for varna in letters:
        if varna in MAAHESHWARA_IDX and HALANTA not in varna:
            candidates.append(MAAHESHWARA_IDX[varna])
        else:
            ignored.append(varna)
    if ignored:
        logger.info(f"Ignored letters: {ignored}")

    # Without this guard product() yields a single empty combination and the
    # continuity check below fails with IndexError.
    if not candidates:
        logger.warning("Cannot form a pratyaahara without any valid letters.")
        return None

    # A letter may occur at several positions; try every combination of
    # occurrences until one forms an unbroken run of serial numbers.
    for combination in product(*candidates):
        v_idx = sorted(combination, key=lambda position: position[2])
        serials = [position[2] for position in v_idx]
        if serials == list(range(serials[0], serials[-1] + 1)):
            break
    else:
        logger.warning("Cannot form a pratyaahara due to discontinuity.")
        return None

    _aadi_idx = v_idx[0]
    _pre_antya_idx = v_idx[-1]
    # The run must end on the entry immediately before its row's end marker.
    if _pre_antya_idx[1] != len(MAAHESHWARA_SUTRA[_pre_antya_idx[0]]) - 2:
        logger.warning("Cannot form a pratyaahara due to end position.")
        return None

    aadi = MAAHESHWARA_SUTRA[_aadi_idx[0]][_aadi_idx[1]]
    antya = MAAHESHWARA_SUTRA[_pre_antya_idx[0]][-1]
    return f'{aadi}{antya}'
def resolve_pratyaahaara(pratyaahaara):
    """
    Resolve pratyaahaara

    The first character is taken as the starting letter and the remainder as
    the end marker. Because either may occur more than once in
    MAAHESHWARA_KRAMA, one expansion is returned for every (start, end) pair
    with start before end.

    @return:
        list of lists of letters; entries containing HALANTA are left out
    """
    aadi, antya = pratyaahaara[0], pratyaahaara[1:]
    starts = [
        position for position, varna in enumerate(MAAHESHWARA_KRAMA)
        if varna == aadi
    ]
    ends = [
        position for position, varna in enumerate(MAAHESHWARA_KRAMA)
        if varna == antya
    ]

    resolutions = []
    for start in starts:
        for end in ends:
            if start >= end:
                continue
            resolutions.append([
                varna for varna in MAAHESHWARA_KRAMA[start:end]
                if HALANTA not in varna
            ])
    return resolutions
###############################################################################
def clean(text, punct=False, digits=False, spaces=True, allow=None):
    """
    Clean a line of samskRta text

    Every character that is not whitelisted is removed; afterwards runs of
    whitespace inside a line are collapsed to a single space and lines that
    are empty (or whitespace only) are dropped.

    - punct: False (True means punctuations are kept)
    - digits: False (True means digits are kept)
    - spaces: True (we usually don't want to change this)
    - allow: iterable of additional characters to allow (default: none)
    """
    # A set gives constant-time membership per character; the default for
    # `allow` is None rather than a shared mutable list.
    allowed = set(ALPHABET)
    if allow:
        allowed.update(allow)
    if spaces:
        allowed.update(SPACES)
    if punct:
        allowed.update(PUNC)
        allowed.update(GEN_PUNC)
    if digits:
        allowed.update(DIGITS)

    kept = ''.join(c for c in text if c in allowed)
    return '\n'.join(
        ' '.join(line.split())
        for line in kept.split('\n')
        if line.strip()
    )
def split_lines(text, pattern=r'[।॥\r\n]+'):
    """Split ``text`` on the regex ``pattern`` and drop the empty pieces."""
    return [piece for piece in re.split(pattern, text) if piece]
###############################################################################
def trim_matra(line):
    """
    Strip a trailing ANUSWARA / HALANTA / VISARGA (at most one) and then a
    trailing MATRA (at most one) from ``line``.

    An empty string, or a string that becomes empty after the first step, is
    returned as is instead of raising IndexError.
    """
    answer = line
    # Slicing with [-1:] yields '' for an empty string, which matches nothing.
    if answer[-1:] in (ANUSWARA, HALANTA, VISARGA):
        answer = answer[:-1]
    if answer[-1:] in MATRA:
        answer = answer[:-1]
    return answer
###############################################################################
def is_laghu(syllable):
    """
    Checks if the current syllable is Laghu

    True when every character is a consonant, a member of LAGHU_SWARA or
    LAGHU_MATRA, or HALANTA (vacuously True for an empty syllable).
    """
    laghu_chars = VYANJANA + LAGHU_SWARA + LAGHU_MATRA + [HALANTA]
    return all(ch in laghu_chars for ch in syllable)
def toggle_matra(syllable):
    """
    Change the Laghu syllable to Guru and Guru to Laghu (if possible)

    Only MATRA positions 1..8 and SWARA positions 0..9 come in toggleable
    pairs; for anything else the function returns None.
    """
    last = syllable[-1]
    if last in MATRA:
        position = MATRA.index(last)
        if 1 <= position <= 8:
            # Pairs are (1, 2), (3, 4), (5, 6), (7, 8).
            partner = position + 1 if position % 2 else position - 1
            return syllable[:-1] + MATRA[partner]

    if syllable in SWARA:
        position = SWARA.index(syllable)
        if position <= 9:
            # Pairs are (0, 1), (2, 3), ..., (8, 9).
            partner = position - 1 if position % 2 else position + 1
            return SWARA[partner]

    return None
###############################################################################
def maatra_to_swara(m):
    """
    Convert the Matra to corresponding Swara

    The placeholder '-' + SWARA[0] maps to SWARA[0]; anything that is not a
    known matra yields None.
    """
    first_swara = SWARA[0]
    if m == f'-{first_swara}':
        return first_swara
    if m not in MATRA:
        return None
    # MATRA[i] corresponds to SWARA[i + 1].
    return SWARA[MATRA.index(m) + 1]
def swara_to_maatra(s):
    """
    Convert a Swara to corresponding Matra

    SWARA[0] has no sign and maps to the placeholder '-' + SWARA[0]; anything
    that is not a known swara yields None.
    """
    if s == SWARA[0]:
        return f'-{s}'
    if s not in SWARA:
        return None
    # SWARA[i] corresponds to MATRA[i - 1].
    return MATRA[SWARA.index(s) - 1]
###############################################################################
def get_anunaasika(ch):
    """
    Get appropriate anunasik from the character's group

    - empty string: the fifth letter of AUSHTHYA
    - one of the first 25 consonants: the fifth letter of its group of five
    - anything else: ANUSWARA
    """
    if ch == '':
        return AUSHTHYA[4]
    if ch not in VYANJANA:
        return ANUSWARA

    position = VYANJANA.index(ch)
    if position >= 25:
        return ANUSWARA
    group_start = (position // 5) * 5
    return VYANJANA[group_start + 4]
def fix_anuswara(text):
    """
    Replace every ANUSWARA that is directly followed by a VARGIYA consonant
    with the matching anunaasika plus HALANTA. Empty input gives ''.
    """
    if not text:
        return ''

    fixed = []
    # Walk over (character, following character) pairs; the final character
    # has no successor and is copied through unchanged afterwards.
    for char, next_char in zip(text, text[1:]):
        if char == ANUSWARA and next_char in VARGIYA:
            fixed.append(get_anunaasika(next_char))
            fixed.append(HALANTA)
        else:
            fixed.append(char)
    fixed.append(text[-1])
    return ''.join(fixed)
###############################################################################
def get_syllables_word(word, technical=False):
    """
    Get syllables from a Sanskrit word

    The word is first passed through clean() with spaces removed.

    @params:
        word: word to get syllables from
        technical: (boolean)
                   if True, ensures that each element contains at most
                   one Swara or Vyanjana
    @return:
        list of syllables (strings)
    """
    cleaned = clean(word, spaces=False)
    total = len(cleaned)

    # Characters at which a new unit may begin.
    boundary_chars = VARNA + SPECIAL + (EXTRA_MATRA if technical else [])

    syllables = []
    pending = ''
    pos = 0
    while pos < total:
        pending += cleaned[pos]
        pos += 1
        # Absorb everything up to (not including) the next boundary character.
        while pos < total and cleaned[pos] not in boundary_chars:
            pending += cleaned[pos]
            pos += 1
        # Outside technical mode a unit ending in HALANTA is carried over and
        # merged with the following unit, unless the word ends here.
        if technical or pos == total or not pending.endswith(HALANTA):
            syllables.append(pending)
            pending = ''
    return syllables
def get_syllables(text, technical=False):
    """
    Get syllables from a Sanskrit text

    @params:
        text: text to get syllables from
        technical: (boolean)
                   if True, ensures that each element contains at most
                   one Swara or Vyanjana
    @return:
        nested list: lines -> words -> syllables
    """
    return [
        [get_syllables_word(word, technical) for word in line.split()]
        for line in split_lines(text.strip())
    ]
###############################################################################
def split_varna_word(word, technical=True):
    """
    Give a Varna decomposition of a Sanskrit word

    @params:
        word: word to be split
        technical: (boolean)
                   if True would give split more useful for analysis
                   (consonants carry HALANTA, vowels after a consonant are
                   given as matra signs or the placeholder '-' + SWARA[0]);
                   if False, matra signs are replaced by the corresponding
                   swara and EXTRA_MATRA marks are attached to the
                   preceding element
    @return:
        viccheda: flat list of strings for this single word
    """
    akaara = '-' + SWARA[0]

    word_viccheda = []
    for syllable in get_syllables_word(word, True):
        head = syllable[0]
        if head in SWARA:
            word_viccheda.append(head)
            if len(syllable) > 1:
                word_viccheda.append(syllable[1])
            # TODO: Will this ever be the case?
            if len(syllable) > 2:
                logger.debug(f"Long SWARA: {syllable}")
                word_viccheda.append(syllable[2:])
        elif head in VYANJANA:
            word_viccheda.append(head + HALANTA)
            if len(syllable) == 1:
                # Bare consonant: the inherent vowel is made explicit.
                word_viccheda.append(akaara)
            if len(syllable) > 1:
                if syllable[1] in EXTRA_MATRA:
                    word_viccheda.append(akaara)
                if syllable[1] != HALANTA:
                    word_viccheda.append(syllable[1])
            # TODO: Will this ever be the case?
            if len(syllable) > 2:
                logger.debug(f"Long VYANJANA: {syllable}")
                word_viccheda.append(syllable[2:])
        else:
            word_viccheda.append(syllable)

    if technical:
        return word_viccheda

    real_word_viccheda = []
    for varna in word_viccheda:
        if varna in MATRA:
            real_word_viccheda.append(SWARA[MATRA.index(varna) + 1])
        elif varna == akaara:
            real_word_viccheda.append(varna[1])
        elif varna in EXTRA_MATRA and real_word_viccheda:
            real_word_viccheda[-1] += varna
        else:
            # Also reached for a leading EXTRA_MATRA mark, which has nothing
            # to attach to; it is kept as its own element instead of failing
            # with IndexError.
            real_word_viccheda.append(varna)
    return real_word_viccheda
def split_varna(text, technical=True, flat=False):
    """
    Give a Varna decomposition of a Sanskrit text

    @params:
        text: text to be split
        technical: (boolean)
                   if True would give split more useful for analysis
        flat: (boolean)
              If True,
              return a single list instead of nested lists
              words will be separated by a space, lines by a newline char
              The default is False
    @return:
        viccheda: list of list of lists (or one flat list when flat is True)
                  Viccheda of each word is a list.
                  - List of Viccheda of each word from a line
                  - List of Viccheda of each line from the text
    """
    nested = [
        [split_varna_word(word, technical) for word in line.split()]
        for line in split_lines(text.strip())
    ]
    if not flat:
        return nested

    flattened = []
    for line_no, line_words in enumerate(nested):
        if line_no:
            # Separator between consecutive lines.
            flattened.append('\n')
        for word_no, word_viccheda in enumerate(line_words):
            if word_no:
                # Separator between consecutive words of a line.
                flattened.append(' ')
            flattened.extend(word_viccheda)
    return flattened
def join_varna(viccheda, technical=True):
    """
    Join Varna decomposition to form a Sanskrit word

    Parameters
    ----------
    viccheda : list
        Viccheda output obtained by split_varna_word
        (or output of split_varna with flat=True)
    technical : bool
        Value of the same parameter passed to split_varna_word
        (accepted for symmetry; the joining logic does not consult it)

    Returns
    -------
    s : str
        Sanskrit word
    """
    separators = (' ', '\n')
    akaara = '-अ'
    independent_starts = SWARA + SPECIAL
    dependent_vowels = MATRA + [akaara]
    standalone_marks = dependent_vowels + EXTRA_MATRA

    word = []
    total = len(viccheda)
    i = 0
    while i < total:
        curr_syl = viccheda[i]
        next_syl = viccheda[i + 1] if i < total - 1 else ''
        i += 1

        if curr_syl in separators:
            word.append(curr_syl)
            continue

        if curr_syl[0] in independent_starts:
            word.append(curr_syl[0])
            if curr_syl[-1] in EXTRA_MATRA:
                word.append(curr_syl[-1])

        if curr_syl[-1] == HALANTA:
            # A consonant: what follows decides how it is written.
            if next_syl in separators:
                word.append(curr_syl)
                continue
            if next_syl == '':
                word.append(curr_syl)
                break
            if next_syl[-1] == HALANTA:
                # Followed by another consonant: keep the HALANTA (conjunct).
                word.append(curr_syl)
            if next_syl[0] in SWARA:
                # Followed by a swara: consume it and write it as a matra.
                i += 1
                word.append(curr_syl[:-1])
                if next_syl[0] != SWARA[0]:
                    word.append(MATRA[SWARA.index(next_syl[0]) - 1])
                # Re-emit any mark attached to the swara. Checking only for
                # VISARGA here silently dropped anuswara and chandrabindu.
                if next_syl[-1] in EXTRA_MATRA:
                    word.append(next_syl[-1])
            if next_syl in EXTRA_MATRA:
                i += 1
                word.append(curr_syl[:-1] + next_syl)
            if next_syl in dependent_vowels:
                i += 1
                word.append(curr_syl[:-1])
                if next_syl != akaara:
                    word.append(next_syl)

        if curr_syl in standalone_marks:
            word.append(curr_syl)

    return ''.join(word)
###############################################################################
###############################################################################
# Ucchaarana Sthaana Module
# ------------------------
# Maps a sthaana code to the list of letters/marks assigned to it. A letter
# may appear under more than one code (e.g. the members of VARGA_PANCHAMA are
# listed under their own group and under 'S_N').
STHAANA = {
'S_K': ['अ', 'आ'] + KANTHYA + ['ह'] + [VISARGA],
'S_T': ['इ', 'ई'] + TALAVYA + ['य', 'श'],
'S_M': ['ऋ', 'ॠ'] + MURDHANYA + ['र', 'ष'],
'S_D': ['ऌ', 'ॡ'] + DANTYA + ['ल', 'स'],
'S_O': ['उ', 'ऊ'] + AUSHTHYA + [UPADHMANIYA],
'S_N': VARGA_PANCHAMA + [ANUSWARA],
'S_KT': ['ए', 'ऐ'],
'S_KO': ['ओ', 'औ'],
'S_DO': ['व'],
'S_JM': [JIHVAAMULIYA]
}
# Sanskrit display name for every code in STHAANA (same key set).
STHAANA_NAMES = {
'S_K': 'कण्ठः',
'S_T': 'तालु',
'S_M': 'मूर्धा',
'S_D': 'दन्ताः',
'S_O': 'ओष्ठौ',
'S_N': 'नासिका',
'S_KT': 'कण्ठतालु',
'S_KO': 'कण्ठौष्ठम्',
'S_DO': 'दन्तौष्ठम्',
'S_JM': 'जिह्वामूलम्'
}
###############################################################################
# Maps an aabhyantara-prayatna code to the list of letters/marks assigned to
# it. 'A_SVT' holds only SWARA[0]; 'A_VVT' holds the remaining swara plus
# CHANDRABINDU, ANUSWARA and VISARGA.
AABHYANTARA = {
'A_SP': VARGIYA,
'A_ISP': ANTAHSTHA,
'A_IVVT': USHMA + [JIHVAAMULIYA, UPADHMANIYA],
'A_VVT': SWARA[1:] + [CHANDRABINDU, ANUSWARA, VISARGA],
'A_SVT': SWARA[:1]
}
# Sanskrit display name for every code in AABHYANTARA (same key set).
AABHYANTARA_NAMES = {
'A_SP': 'स्पृष्टः',
'A_ISP': 'ईषत्स्पृष्टः',
'A_IVVT': 'ईषद्विवृतः',
'A_VVT': 'विवृतः',
'A_SVT': 'संवृतः'
}
###############################################################################
# Maps a baahya-prayatna code to the list of letters assigned to it.
# resolve_pratyaahaara() returns a list of candidate expansions; `[0]` takes
# the first one. The same two expansions ('खर्' and 'हश्' + SWARA) are reused
# for several codes, so one letter belongs to several codes at once.
# 'B_ANU' / 'B_SWA' hold two-character strings (swara + accent mark).
BAAHYA = {
'B_VVR': resolve_pratyaahaara('खर्')[0],
'B_SVR': resolve_pratyaahaara('हश्')[0] + SWARA,
'B_SW': resolve_pratyaahaara('खर्')[0],
'B_ND': resolve_pratyaahaara('हश्')[0] + SWARA,
'B_GH': resolve_pratyaahaara('हश्')[0] + SWARA,
'B_AGH': resolve_pratyaahaara('खर्')[0],
'B_AP': (
VARGA_PRATHAMA + VARGA_TRITIYA + VARGA_PANCHAMA +
resolve_pratyaahaara('यण्')[0]
) + SWARA + [CHANDRABINDU, ANUSWARA],
'B_MP': (
VARGA_DWITIYA + VARGA_CHATURTHA +
resolve_pratyaahaara('शल्')[0]
) + [VISARGA, JIHVAAMULIYA, UPADHMANIYA],
'B_U': SWARA,
'B_ANU': [s + ANUDATTA for s in SWARA],
'B_SWA': [s + SWARITA for s in SWARA]
}
# Sanskrit display name for every code in BAAHYA (same key set).
BAAHYA_NAMES = {
'B_VVR': 'विवारः',
'B_SVR': 'संवारः',
'B_SW': 'श्वासः',
'B_ND': 'नादः',
'B_GH': 'घोषः',
'B_AGH': 'अघोषः',
'B_AP': 'अल्पप्राणः',
'B_MP': 'महाप्राणः',
'B_U': 'उदात्तः',
'B_ANU': 'अनुदात्तः',
'B_SWA': 'स्वरितः'
}
###############################################################################
# Merged views over the three dictionaries above, in the order sthaana,
# aabhyantara, baahya. The S_/A_/B_ key prefixes keep the key sets disjoint
# (dict(**a, **b) raises TypeError on a repeated key).
UCCHAARANA = dict(**STHAANA, **AABHYANTARA, **BAAHYA)
UCCHAARANA_NAMES = dict(**STHAANA_NAMES, **AABHYANTARA_NAMES, **BAAHYA_NAMES)
###############################################################################
def get_ucchaarana_vector(letter, abbrev=False):
    """
    Get ucchaarana sthaana and prayatna based vector of a letter

    Parameters
    ----------
    letter : str
        Sanskrit letter
        (if it ends in HALANTA, every HALANTA is removed before lookup)
    abbrev : bool
        If True,
            The output will contain English abbreviations
        Otherwise,
            The output will contain Sanskrit names
        The default is False.

    Returns
    -------
    vector : dict
        one entry per key of UCCHAARANA, valued 1 when the letter is listed
        under that key and 0 otherwise
    """
    varna = letter
    if letter.endswith(HALANTA):
        varna = letter.replace(HALANTA, '')

    # Output label for every code, in UCCHAARANA order.
    labels = {
        code: (code if abbrev else UCCHAARANA_NAMES[code])
        for code in UCCHAARANA
    }
    vector = dict.fromkeys(labels.values(), 0)
    for code, varna_list in UCCHAARANA.items():
        if varna in varna_list:
            vector[labels[code]] = 1
    return vector
def get_ucchaarana_vectors(word, abbrev=False):
    """
    Get ucchaarana sthaana and prayatna based vector of a word or text

    Parameters
    ----------
    word : str
        Sanskrit word (or text)
    abbrev : bool
        If True,
            The output will contain English abbreviations
        Otherwise,
            The output will contain Sanskrit names
        The default is False.

    Returns
    -------
    vectors : list
        List of (letter, vector)
    """
    letters = []
    for unit in split_varna_word(word, technical=False):
        if any(mark in unit for mark in EXTRA_MATRA):
            # A unit carrying an attached mark is broken into its characters
            # so the mark gets a vector of its own.
            letters.extend(unit)
        else:
            letters.append(unit)

    vectors = []
    for letter in letters:
        vectors.append((letter, get_ucchaarana_vector(letter, abbrev)))
    return vectors
###############################################################################
def get_signature_letter(letter, abbrev=False):
    """
    Get ucchaarana sthaana and prayatna based signature of a letter

    Parameters
    ----------
    letter : str
        Sanskrit letter
    abbrev : bool
        If True,
            The output will contain English abbreviations
        Otherwise,
            The output will contain Sanskrit names
        The default is False.

    Returns
    -------
    signature : dict
        keys 'sthaana', 'aabhyantara' and 'baahya', each holding the result
        of get_ucchaarana_letter for dimension 0, 1 and 2 respectively
    """
    dimension_names = ('sthaana', 'aabhyantara', 'baahya')
    return {
        name: get_ucchaarana_letter(letter, dimension=number, abbrev=abbrev)
        for number, name in enumerate(dimension_names)
    }
def get_signature_word(word, abbrev=False):
    """
    Get ucchaarana sthaana and prayatna based signature of a word

    Parameters
    ----------
    word : str
        Sanskrit word (or text)
        Caution:
            If multiple words are provided, the spaces are not included in
            the output list
    abbrev : bool
        If True,
            The output will contain English abbreviations
        Otherwise,
            The output will contain Sanskrit names
        The default is False.

    Returns
    -------
    list
        List of (letter, signature)
    """
    result = []
    for unit in split_varna_word(word, technical=False):
        carries_mark = any(mark in unit for mark in EXTRA_MATRA)
        # Units with an attached mark are reported character by character.
        parts = list(unit) if carries_mark else [unit]
        for letter in parts:
            result.append((letter, get_signature_letter(letter, abbrev)))
    return result
def get_signature(text, abbrev=False):
    """
    Get ucchaarana list of a Sanskrit text

    Parameters
    ----------
    text : str
        Sanskrit text (can contain newlines, spaces)
    abbrev : bool
        If True,
            The output will contain English abbreviations
        Otherwise,
            The output will contain Sanskrit names
        The default is False.

    Returns
    -------
    list
        List of (letter, signature) for words in a nested list manner
        Nesting Levels: Text -> Lines -> Words
    """
    return [
        [get_signature_word(word, abbrev) for word in line.split()]
        for line in split_lines(text.strip())
    ]
###############################################################################
def get_ucchaarana_letter(letter, dimension=0, abbrev=False):
    """
    Get ucchaarana sthaana or prayatna of a letter

    Parameters
    ----------
    letter : str
        Sanskrit letter
        (if it ends in HALANTA, every HALANTA is removed before lookup)
    dimension : int
        0 : sthaana
        1 : aabhyantara prayatna
        2 : baahya prayatna
    abbrev : bool
        If True,
            The output will contain English abbreviations
        Otherwise,
            The output will contain Sanskrit names
        The default is False.

    Returns
    -------
    str
        ucchaarana sthaana or prayatna of a letter
        (abbreviations joined by '-'; Sanskrit names joined by spaces, with
        'च' appended when more than one name applies; '' when none applies)
    """
    varna = letter
    if letter.endswith(HALANTA):
        varna = letter.replace(HALANTA, '')

    groups = (STHAANA, AABHYANTARA, BAAHYA)[dimension]
    names = (STHAANA_NAMES, AABHYANTARA_NAMES, BAAHYA_NAMES)[dimension]

    matched = [code for code, members in groups.items() if varna in members]
    if abbrev:
        return '-'.join(matched)

    labels = [names[code] for code in matched]
    if len(labels) > 1:
        labels.append('च')
    return ' '.join(labels)
def get_ucchaarana_word(word, dimension=0, abbrev=False):
    """
    Get ucchaarana of a word

    Parameters
    ----------
    word : str
        Sanskrit word (or text)
        Caution:
            If multiple words are provided, the spaces are not included in
            the output list
    dimension : int
        0 : sthaana
        1 : aabhyantara prayatna
        2 : baahya prayatna
    abbrev : bool
        If True,
            The output will contain English abbreviations
        Otherwise,
            The output will contain Sanskrit names
        The default is False.

    Returns
    -------
    list
        List of (letter, ucchaarana)
    """
    result = []
    for unit in split_varna_word(word, technical=False):
        carries_mark = any(mark in unit for mark in EXTRA_MATRA)
        # Units with an attached mark are reported character by character.
        parts = list(unit) if carries_mark else [unit]
        for letter in parts:
            result.append(
                (letter, get_ucchaarana_letter(letter, dimension, abbrev))
            )
    return result
def get_ucchaarana(text, dimension=0, abbrev=False):
    """
    Get ucchaarana list of a Sanskrit text

    Parameters
    ----------
    text : str
        Sanskrit text (can contain newlines, spaces)
    dimension : int
        0 : sthaana
        1 : aabhyantara prayatna
        2 : baahya prayatna
    abbrev : bool
        If True,
            The output will contain English abbreviations
        Otherwise,
            The output will contain Sanskrit names
        The default is False.

    Returns
    -------
    list
        List of (letter, ucchaarana) for words in a nested list manner
        Nesting Levels: Text -> Lines -> Words
    """
    return [
        [get_ucchaarana_word(word, dimension, abbrev) for word in line.split()]
        for line in split_lines(text.strip())
    ]
###############################################################################
def get_sthaana_letter(letter, abbrev=False):
    """Shortcut: get_ucchaarana_letter with dimension 0 (sthaana)."""
    return get_ucchaarana_letter(letter, 0, abbrev)
def get_sthaana_word(word, abbrev=False):
    """Shortcut: get_ucchaarana_word with dimension 0 (sthaana)."""
    return get_ucchaarana_word(word, 0, abbrev)
def get_sthaana(text, abbrev=False):
    """Shortcut: get_ucchaarana with dimension 0 (sthaana)."""
    return get_ucchaarana(text, 0, abbrev)
# --------------------------------------------------------------------------- #
def get_aabhyantara_letter(letter, abbrev=False):
    """Shortcut: get_ucchaarana_letter with dimension 1 (aabhyantara)."""
    return get_ucchaarana_letter(letter, 1, abbrev)
def get_aabhyantara_word(word, abbrev=False):
    """Shortcut: get_ucchaarana_word with dimension 1 (aabhyantara)."""
    return get_ucchaarana_word(word, 1, abbrev)
def get_aabhyantara(text, abbrev=False):
    """Shortcut: get_ucchaarana with dimension 1 (aabhyantara)."""
    return get_ucchaarana(text, 1, abbrev)
# --------------------------------------------------------------------------- #
def get_baahya_letter(letter, abbrev=False):
    """Shortcut: get_ucchaarana_letter with dimension 2 (baahya)."""
    return get_ucchaarana_letter(letter, 2, abbrev)
def get_baahya_word(word, abbrev=False):
    """Shortcut: get_ucchaarana_word with dimension 2 (baahya)."""
    return get_ucchaarana_word(word, 2, abbrev)
def get_baahya(text, abbrev=False):
    """Shortcut: get_ucchaarana with dimension 2 (baahya)."""
    return get_ucchaarana(text, 2, abbrev)
###############################################################################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment