Created
February 20, 2012 10:13
-
-
Save srikanthlogic/1868666 to your computer and use it in GitHub Desktop.
Files for Tamil-English Reverse Transliterator https://github.com/santhoshtr/silpa
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# Any Indian Language to any other Indian language transliterator | |
# Copyright 2008-2010 Santhosh Thottingal <[email protected]> | |
# http://www.smc.org.in | |
# | |
# This program is free software; you can redistribute it and/or modify | |
# it under the terms of the GNU General Public License as published by | |
# the Free Software Foundation; either version 3 of the License, or | |
# (at your option) any later version. | |
# | |
# This program is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
# GNU Library General Public License for more details. | |
# | |
# You should have received a copy of the GNU General Public License | |
# along with this program; if not, write to the Free Software | |
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | |
# | |
# If you find any bugs or have any suggestions email: [email protected] | |
# URL: http://www.smc.org.in | |
import string | |
import os | |
CMU_MALAYALAM_MAP = { | |
"AA" : "à´“", | |
"AH" : "à´…", | |
"AE" : "à´", | |
"AO" : "à´“", | |
"AW" : "à´”", | |
"AY" : "à´", | |
"B" : "à´¬àµ", | |
"CH" : "à´šàµà´šàµ", | |
"D" : "à´¡àµ", | |
"DH" : "à´¦àµ", | |
"EA" : "à´ˆ", | |
"EH" : "à´Ž", | |
"ER" : "à´Žà´°àµâ€", | |
"EY" : "à´Žà´¯àµ", | |
"F" : "à´«àµ", | |
"G" : "à´—àµ", | |
"HH" : "à´¹àµ", | |
"IH" : "à´‡", | |
"IY" : "à´ˆ", | |
"J" : "à´œàµ", | |
"JH" : "à´œàµ", | |
"K" : "à´•àµ", | |
"L" : "à´²àµâ€", | |
"M" : "à´®àµ", | |
"N" : "à´¨àµâ€", | |
"NG" : "à´™àµ", | |
"OW" : "à´’", | |
"P" : "à´ªàµ", | |
"R" : "à´°àµâ€", | |
"S" : "à´¸àµ", | |
"SH" : "à´·àµ", | |
"T" : "à´±àµà´±àµ", | |
"TH" : "à´¤àµ", | |
"Y" : "à´¯àµ", | |
"UW" : "à´‰", | |
"W" : "à´µàµ", | |
"V" : "à´µàµ", | |
"Z" : "à´¸àµ", | |
} | |
CMU_TAMIL_MAP = { | |
"AA" : "ஆ", | |
"AH" : "à®…", | |
"AE" : "எ", | |
"AO" : "à®’", | |
"AW" : "à®”", | |
"AY" : "à®", | |
"B" : "பி", | |
"CH" : "சà¯", | |
"D" : "டà¯", | |
"DH" : "தà¯", | |
"EA" : "à´ˆ", | |
"EH" : "à®", | |
"ER" : "à®…à®°à¯", | |
"EY" : "à®", | |
"F" : "ஃபà¯", | |
"G" : "கà¯", | |
"HH" : "ஹà¯", | |
"IH" : "இ", | |
"IY" : "இ", | |
"J" : "ஜà¯", | |
"JH" : "ஜà¯", | |
"K" : "கà¯", | |
"L" : "லà¯", | |
"M" : "à®®à¯", | |
"N" : "னà¯", | |
"NG" : "à®™à¯", | |
"OW" : "à®”", | |
"P" : "பà¯", | |
"R" : "à®°à¯", | |
"S" : "சà¯", | |
"SH" : "à®·à¯", | |
"T" : "டà¯", | |
"TH" : "தà¯", | |
"Y" : "யà¯", | |
"UW" : "உ", | |
"W" : "வ", | |
"V" : "வ", | |
"Z" : "ஸà¯", | |
} | |
CMU_KANNADA_MAP = { | |
"AA" : "ಆ", | |
"AH" : "ಅ", | |
"AE" : "à²", | |
"AO" : "ಓ", | |
"AW" : "ಔ", | |
"AY" : "à²", | |
"B" : "ಬà³", | |
"CH" : "ಚà³", | |
"D" : "ಡà³", | |
"DH" : "ದà³", | |
"EA" : "ಈ", | |
"EH" : "ಎ", | |
"ER" : "ಅರà³", | |
"EY" : "ಎಯà³", | |
"F" : "ಫà³", | |
"G" : "ಗà³", | |
"HH" : "ಹà³", | |
"IH" : "ಇ", | |
"IY" : "ಈ", | |
"J" : "ಜà³", | |
"JH" : "ಜà³", | |
"K" : "ಕà³", | |
"L" : "ಲà³", | |
"M" : "ಮà³", | |
"N" : "ನà³", | |
"NG" : "ಂಗà³", | |
"OW" : "ಒ", | |
"P" : "ಪà³", | |
"R" : "ರà³", | |
"S" : "ಸà³", | |
"SH" : "ಷà³", | |
"T" : "ಟà³", | |
"TH" : "ತà³", | |
"Y" : "ಯà³", | |
"UW" : "ಊ", | |
"UH":"ಉ", | |
"W" : "ವà³", | |
"V" : "ವà³", | |
"Z":"ಸà³", | |
"ZH":"ಷà³", | |
} | |
class CMUDict():
    """Look up English words in a CMU pronouncing dictionary file and render
    the resulting phoneme symbols in Malayalam, Kannada or Tamil script.

    The dictionary file is read lazily on the first call to ``find``.  Each
    non-blank line is split on whitespace: the first field is the key, the
    remaining fields are the phoneme symbols stored for it.
    """

    # Punctuation peeled off both ends of a word before the dictionary lookup.
    _PUNCTUATION = '!,.?:'

    # (pattern, replacement) pairs applied in order by _fix_vowel_signs_ml.
    # U+0D4D is the Malayalam virama, U+200D the zero-width joiner.
    # NOTE(review): reconstructed from encoding-damaged literals; the entries
    # whose distinguishing byte was lost were inferred from their replacement
    # vowel sign - confirm against the upstream file.
    _ML_VOWEL_SIGN_FIXES = (
        (u"\u0d4d\u0d05", u""),
        (u"\u0d4d\u200d\u0d05", u""),
        (u"\u0d4d\u0d06", u"\u0d3e"),
        (u"\u0d4d\u200d\u0d06", u"\u0d3e"),
        (u"\u0d4d\u0d07", u"\u0d3f"),
        (u"\u0d4d\u200d\u0d07", u"\u0d3f"),
        (u"\u0d4d\u0d08", u"\u0d40"),
        (u"\u0d4d\u200d\u0d08", u"\u0d40"),
        (u"\u0d4d\u0d09", u"\u0d41"),
        (u"\u0d4d\u200d\u0d09", u"\u0d41"),
        (u"\u0d4d\u0d0a", u"\u0d42"),
        (u"\u0d4d\u200d\u0d0a", u"\u0d42"),
        (u"\u0d4d\u0d31", u"\u0d4d\u0d30"),
        (u"\u0d4d\u0d0e", u"\u0d46"),
        (u"\u0d4d\u200d\u0d0e", u""),
        (u"\u0d4d\u0d0f", u"\u0d47"),
        (u"\u0d4d\u200d\u0d0f", u"\u0d47"),
        (u"\u0d4d\u0d10", u"\u0d48"),
        (u"\u0d4d\u200d\u0d10", u"\u0d48"),
        (u"\u0d4d\u0d12", u"\u0d4a"),
        (u"\u0d4d\u200d\u0d12", u"\u0d4a"),
        (u"\u0d4d\u0d13", u"\u0d4b"),
        (u"\u0d4d\u200d\u0d13", u"\u0d4b"),
        (u"\u0d4d\u0d14", u"\u0d4c"),
        (u"\u0d4d\u200d\u0d14", u"\u0d4c"),
        (u"\u0d30\u0d4d\u0d30", u"\u0d31\u0d4d\u0d31"),
        (u"\u0d31\u0d4d\u0d30", u"\u0d31\u0d4d\u0d31"),
        (u"\u0d28\u0d4d\u200d\u0d31\u0d4d\u0d31", u"\u0d28\u0d4d\u0d31"),
    )

    # Same idea for Kannada: virama (U+0CCD) + independent vowel becomes the
    # matching dependent vowel sign.
    # NOTE(review): reconstructed from encoding-damaged literals - confirm.
    _KN_VOWEL_SIGN_FIXES = (
        (u"\u0ccd\u0c85", u""),
        (u"\u0ccd\u0c86", u"\u0cbe"),
        (u"\u0ccd\u0c87", u"\u0cbf"),
        (u"\u0ccd\u0c88", u"\u0cc0"),
        (u"\u0ccd\u0c89", u"\u0cc1"),
        (u"\u0ccd\u0c8a", u"\u0cc2"),
        (u"\u0ccd\u0c8b", u"\u0cc3"),
        (u"\u0ccd\u0c8e", u"\u0cc6"),
        (u"\u0ccd\u0c8f", u"\u0cc7"),
        (u"\u0ccd\u0c90", u"\u0cc8"),
        (u"\u0ccd\u0c92", u"\u0cca"),
        (u"\u0ccd\u0c93", u"\u0ccb"),
        (u"\u0ccd\u0c94", u"\u0ccc"),
    )

    def __init__(self):
        # The dictionary file is expected next to this module.
        self.dictionaryfile = os.path.join(os.path.dirname(__file__),
                                           'cmudict.0.7a_SPHINX_40')
        # Filled by load(); None means "not loaded yet".
        self.cmudictionary = None

    def load(self):
        """Read the dictionary file into ``self.cmudictionary``.

        Blank lines are skipped (they used to raise IndexError) and the file
        handle is always closed.
        """
        entries = dict()
        with open(self.dictionaryfile, "r") as fdict:
            for line in fdict:
                fields = line.split()
                if not fields:
                    continue
                entries[fields[0]] = fields[1:]
        self.cmudictionary = entries

    def find(self, word):
        """Return the list of phoneme symbols stored for ``word``.

        The lookup key is ``word.upper()``.  Raises KeyError when the word is
        not in the dictionary.
        """
        if self.cmudictionary is None:
            self.load()
        return self.cmudictionary[word.upper()]

    def pronunciation(self, word, language):
        """Render the pronunciation of ``word`` in the script of ``language``.

        ``language`` is one of "ml_IN", "kn_IN" or "ta_IN".  Punctuation from
        ``_PUNCTUATION`` at either end of the word is set aside for the lookup
        and re-attached to the result.  When the word is not in the
        dictionary the original ``word`` is returned unchanged.  For any
        other language code only the punctuation is returned.
        """
        # Split the word into leading punctuation, core and trailing
        # punctuation.  Slicing by the length of a two-sided strip() gave a
        # wrong suffix whenever the word started with punctuation.
        without_leading = word.lstrip(self._PUNCTUATION)
        leading = word[:len(word) - len(without_leading)]
        stripped_word = without_leading.rstrip(self._PUNCTUATION)
        trailing = without_leading[len(stripped_word):]

        try:
            cmu_pronunciation = self.find(stripped_word)
        except KeyError:
            return word

        # One branch per language and a single exit, so every supported
        # language returns a value regardless of branch order.
        if language == "ml_IN":
            pronunciation_str = self._fix_vowel_signs_ml(
                self._map_symbols(cmu_pronunciation, CMU_MALAYALAM_MAP))
        elif language == "kn_IN":
            pronunciation_str = self._fix_vowel_signs_kn(
                self._map_symbols(cmu_pronunciation, CMU_KANNADA_MAP))
        elif language == "ta_IN":
            pronunciation_str = self._map_symbols(cmu_pronunciation,
                                                  CMU_TAMIL_MAP)
        else:
            pronunciation_str = ""

        # Under Python 2 the tables hold UTF-8 byte strings; callers expect
        # text, so decode exactly once at the boundary.
        if isinstance(pronunciation_str, bytes):
            pronunciation_str = pronunciation_str.decode("utf-8")
        return leading + pronunciation_str + trailing

    def _map_symbols(self, symbols, table):
        """Concatenate ``table[symbol]`` for each symbol; symbols missing
        from ``table`` are copied through unchanged."""
        return "".join(table.get(symbol, symbol) for symbol in symbols)

    def _apply_fixes(self, text, fixes):
        """Apply ``(old, new)`` replacements in order.

        Accepts text or UTF-8 bytes and returns the same type it was given.
        """
        was_bytes = isinstance(text, bytes)
        if was_bytes:
            text = text.decode("utf-8")
        for old, new in fixes:
            text = text.replace(old, new)
        if was_bytes:
            text = text.encode("utf-8")
        return text

    def _fix_vowel_signs_ml(self, text):
        """Apply the Malayalam virama + vowel replacements to ``text``."""
        return self._apply_fixes(text, self._ML_VOWEL_SIGN_FIXES)

    def _fix_vowel_signs_kn(self, text):
        """Apply the Kannada virama + vowel replacements to ``text``."""
        return self._apply_fixes(text, self._KN_VOWEL_SIGN_FIXES)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
#indic_en.py | |
# | |
#Copyright 2010 Vasudev Kamath <[email protected]> | |
# | |
#This program is free software; you can redistribute it and/or modify | |
#it under the terms of the GNU General Public License as published by | |
#the Free Software Foundation; either version 3 of the License, or | |
#(at your option) any later version. | |
# | |
#This program is distributed in the hope that it will be useful, | |
#but WITHOUT ANY WARRANTY; without even the implied warranty of | |
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
#GNU General Public License for more details. | |
# | |
#You should have received a copy of the GNU General Public License | |
#along with this program; if not, write to the Free Software | |
#Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
#MA 02110-1301, USA. | |
# | |
''' | |
This file contains all language related dictionaries vowel and vowel signs | |
and function which returns a dictionary or vowel or vowel signs for a language | |
Trying to make indic_en transliteration more generic | |
''' | |
kannada_english_dict = {u'ಅ':'a',u'ಆ':'aa',u'ಇ':'i',u'ಈ':'i',u'ಉ':'u',\ | |
u'ಊ':'u',u'ಋ':'rri',u'ಎ':'e',u'à²':'e',u'à²':'ai',\ | |
u'ಒ':'o',u'ಓ':'o',u'ಔ':'au',u'ಂ':'m',u'ಃ':'h',\ | |
u'ಕ':'k',u'ಖ':'kh',u'ಗ':'g',u'ಘ':'gh',u'ಙ':'ng',\ | |
u'ಚ':'ch',u'ಛ':'chh',u'ಜ':'j',u'à²':'jhh',u'ಞ':'nj',\ | |
u'ತ':'th',u'ಥ':'thh',u'ದ':'d',u'ಧ':'dh',u'ನ':'n',\ | |
u'ಟ':'T',u'ಠ':'Th',u'ಡ':'D',u'ಢ':'Dh',u'ಣ':'N',\ | |
u'ಪ':'p',u'ಫ':'ph',u'ಬ':'b',u'à²':'bh',u'ಮ':'m',\ | |
u'ಯ':'y',u'ರ':'r',u'ಲ':'l',u'ವ':'v',u'ಶ':'sh',\ | |
u'ಷ':'shh',u'ಸ':'s',u'ಹ':'h',u'ಳ':'L',\ | |
u'ಋ':'rri',u'à³':'',u'ಾ':'aa',u'ಿ':'i',u'à³€':'i',\ | |
u'à³':'u',u'ೂ':'u',u'ೃ':'rri',u'ೆ':'e',u'ೇ':'e',\ | |
u'ೈ':'ai',u'ೊ':'o',u'ೋ':'o',u'ೌ':'au',\ | |
u'ಕà³à²·':'ksh',u'ತà³à²°':'tr',u'ಜà³à²ž':'jn',\ | |
u'೧':'1',u'೨':'2',u'೩':'3',u'೪':'4',u'೫':'5',\ | |
u'೬':'6',u'à³':'7',u'à³®':'8',u'೯':'9',u'೦':'0'} | |
kn_vowels = [u'ಅ',u'ಆ',u'ಇ',u'ಈ',u'ಉ',u'ಊ',u'ಋ',u'ಎ',u'à²',u'à²',\ | |
u'ಒ',u'ಓ',u'ಔ'] | |
kn_vowel_signs = [u'à³',u'ಾ',u'ಿ',u'à³€',u'à³',u'ೂ',u'ೃ',u'ೆ',u'ೇ',\ | |
u'ೈ',u'ೊ',u'ೋ',u'ೌ',u'ಂ',u'ಃ',u' '] | |
tamil_english_dict = {u'அ':'a',u'ஆ':'aa',u'இ':'i',u'ஈ':'ii',u'உ':'u',u'ஊ':'uu',u'எ':'e',\ | |
u'à®':'ee',u'à®':'ai',u'à®’':'o',u'ஓ':'oo',u'à®”':'au',\ | |
u'கà¯':'k', u'à®™à¯':'ng', u'சà¯':'s','ஞà¯':'nj', u'டà¯':'d', u'ணà¯':'N', u'தà¯':'th', u'நà¯':'w',\ | |
u'பà¯':'p', u'à®®à¯':'m', u'யà¯':'y', u'à®°à¯':'r', u'லà¯':'l', u'வà¯':'v', u'à®´à¯':'zh', u'ளà¯':'L', u'à®±à¯':'R', u'னà¯':'n'} | |
tamil_vowels = [u'அ',u'ஆ',u'இ',u'ஈ',u'உ',u'ஊ',u'எ',\ | |
u'à®',u'à®',u'à®’',u'ஓ',u'à®”',u'ஃ'] | |
tamil_vowel_signs = [u'ா',u'ி',u'ீ',u'à¯',u'ூ',u'ெ',u'ே',u'ை',u'ொ',u'ோ',u'ௌ',u'à¯'] | |
malayalam_english_dict={u'à´…':'a',u'à´†':'aa',u'à´‡':'i',u'à´ˆ':'ee',u'à´‰':'u',u'à´Š':'oo',u'à´‹':'ri',\ | |
u'à´Ž':'e',u'à´':'e',u'à´':'ai',u'à´’':'o',u'à´“':'o',u'à´”':'au',\ | |
u'à´•':'k',u'à´–':'kh',u'à´—':'g',u'à´˜':'gh',u'à´™àµà´™':'ng',u'à´™':'ng',\ | |
u'à´š':'ch',u'à´›':'chh',u'à´œ':'j',u'à´':'jhh',u'à´ž':'nj',\ | |
u'à´Ÿ':'t',u'à´ ':'th',u'à´¡':'d',u'à´¢':'dh',u'à´£':'n',\ | |
u'à´¤':'th',u'à´¥':'th',u'à´¦':'d',u'à´§':'dh',u'à´¨':'n',\ | |
u'à´ª':'p',u'à´«':'ph',u'à´¬':'b',u'à´':'bh',u'à´®':'m',\ | |
u'à´¯':'y',u'à´°':'r',u'à´²':'l', u'à´µ':'v', u'à´±':'r',\ | |
u'à´¶':'s',u'à´·':'sh',u'à´¸':'s', u'à´¹':'h',u'à´³':'l',u'à´´':'zh',\ | |
u'àµ':'',u'à´‚':'m',u'à´¾':'aa',u'à´¿':'i' ,u'ീ':'ee' ,u'àµ':'u',\ | |
u'ൂ':'oo',u'ൃ':'ri' ,u'െ':'e' ,u'േ':'e',\ | |
u'ൈ':'ai',u'ൊ':'o' ,u'ോ':'oo' ,u'ൗ':'au', u'ൌ':'ou'} | |
ml_vowels = [u'à´…',u'à´†',u'à´‡',u'à´ˆ',u'à´‰' ,u'à´Š',u'à´‹', u'à´Ž',u'à´',u'à´',\ | |
u'à´’',u'à´“',u'à´”'] | |
ml_vowel_signs = [u'àµ',u'à´‚',u'à´¾',u'à´¿',u'ീ',u'àµ', u'ൂ',u'ൃ' ,u'െ' ,u'േ',\ | |
u'ൈ',u'ൊ' ,u'ോ' ,u'ൗ' , u'ൌ',u'â€'] | |
# P.S: Please declare all language related variables above this and | |
# fill in the following mapping as you add dictionary vowels and | |
# vowel_signs for your language | |
# language dictionary mapping | |
language_dictionary = {
    "kn_IN": kannada_english_dict,
    "ml_IN": malayalam_english_dict,
}
# language vowels mapping
language_vowels = {"kn_IN": kn_vowels, "ml_IN": ml_vowels}
# language vowel signs mapping
language_vowel_signs = {
    "kn_IN": kn_vowel_signs,
    "ml_IN": ml_vowel_signs,
}
# language virama sign mapping
# Written as escapes because the literals were encoding-damaged.
# NOTE(review): U+0CCD / U+0D4D chosen from the table's name - confirm.
language_virama = {"kn_IN": u"\u0ccd", "ml_IN": u"\u0d4d"}
# language anuswara sign mapping (U+0C82 Kannada, U+0D02 Malayalam)
language_anuswara = {"kn_IN": u"\u0c82", "ml_IN": u"\u0d02"}
def get_dictionary_for(lang="ml_IN"):
    """
    Return the script-to-English dictionary registered for `lang`.

    When no dictionary is registered for `lang`, the ml_IN dictionary is
    returned instead (previously the literal string "ml_IN" was returned).

    Arguments:
    - `lang`: Language for which dictionary is required
    """
    return language_dictionary.get(lang, language_dictionary["ml_IN"])
def get_vowels_for(lang="ml_IN"):
    """
    Return the vowel list registered for `lang`.

    When no vowel list is registered for `lang`, the ml_IN list is returned
    instead (previously the literal string "ml_IN" was returned).

    Arguments:
    - `lang`: Language for which vowel list should be returned
    """
    return language_vowels.get(lang, language_vowels["ml_IN"])
def get_vowel_signs_for(lang="ml_IN"):
    """
    Return the vowel-sign list registered for `lang`.

    When no vowel-sign list is registered for `lang`, the ml_IN list is
    returned instead (previously the literal string "ml_IN" was returned).

    Arguments:
    - `lang`: Language for which vowel signs list should be returned
    """
    return language_vowel_signs.get(lang, language_vowel_signs["ml_IN"])
def get_virama_for(lang="ml_IN"):
    """
    Return the virama symbol registered for `lang`.

    When no virama is registered for `lang`, the ml_IN virama is returned
    instead (previously the literal string "ml_IN" was returned).

    Arguments:
    - `lang`: Language for which virama symbol should be returned
    """
    return language_virama.get(lang, language_virama["ml_IN"])
def get_anuswara_for(lang="ml_IN"):
    """
    Return the anuswara symbol registered for `lang`.

    When no anuswara is registered for `lang`, the ml_IN anuswara is
    returned instead (previously the literal string "ml_IN" was returned).

    Arguments:
    - `lang`: Language for which anuswara symbol is needed
    """
    return language_anuswara.get(lang, language_anuswara["ml_IN"])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# Any Indian Language to any other Indian language transliterator | |
# Copyright 2009-2010 Santhosh Thottingal <[email protected]> | |
# http://www.smc.org.in | |
# | |
# This program is free software; you can redistribute it and/or modify | |
# it under the terms of the GNU Lesser General Public License as published by | |
# the Free Software Foundation; either version 3 of the License, or | |
# (at your option) any later version. | |
# | |
# This program is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
# GNU Lesser General Public License for more details. | |
# | |
# You should have received a copy of the GNU General Public License | |
# along with this program; if not, write to the Free Software | |
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | |
# | |
# If you find any bugs or have any suggestions | |
# email: [email protected] | |
# URL: http://www.smc.org.in | |
from common import * | |
from utils import * | |
import string | |
import os | |
from cmudict import CMUDict | |
from indic_en import * | |
class Transliterator(SilpaModule): | |
def __init__(self):
    """Record the template path, create the CMU dictionary helper and
    build the response object from the template."""
    module_dir = os.path.dirname(__file__)
    self.template = os.path.join(module_dir, 'transliterate.html')
    self.cmu = CMUDict()
    self.response = SilpaResponse(self.template)
def transliterate_en_ml(self, word):
    """
    Transliterate an English word to Malayalam by asking the CMU
    pronunciation dictionary helper for its "ml_IN" rendering.
    """
    lookup = self.cmu.pronunciation
    return lookup(word, "ml_IN")
def transliterate_en_kn(self, word):
    """
    Transliterate an English word to Kannada by asking the CMU
    pronunciation dictionary helper for its "kn_IN" rendering.
    """
    lookup = self.cmu.pronunciation
    return lookup(word, "kn_IN")
def transliterate_en_ta(self, word):
    """
    Transliterate an English word to Tamil by asking the CMU
    pronunciation dictionary helper for its "ta_IN" rendering.
    """
    lookup = self.cmu.pronunciation
    return lookup(word, "ta_IN")
def transliterate_en_xx(self, word, target_lang):
    """
    Transliterate an English word to an Indian language.

    English targets return the word unchanged.  Kannada and Tamil use their
    own CMU phoneme tables.  Every other target is produced by first
    transliterating to Malayalam and then chaining the result through
    transliterate_indic_indic.
    """
    if target_lang in ("en_IN", "en_US"):
        return word
    # Direct routes.  The Kannada result used to be overwritten by the
    # Malayalam fallback because the "else" belonged to the Tamil test.
    if target_lang == "kn_IN":
        return self.transliterate_en_kn(word)
    if target_lang == "ta_IN":
        return self.transliterate_en_ta(word)

    tx_str = self.transliterate_en_ml(word)
    if target_lang == "ml_IN":
        return tx_str

    # Chain through Indic-to-Indic transliteration.  Literals are written
    # as escapes because the originals were encoding-damaged.
    # NOTE(review): decoded as ZWJ (U+200D) and Malayalam virama (U+0D4D)
    # from the accompanying comments - confirm.
    zero_width_joiner = u"\u200d"
    virama = u"\u0d4d"
    tx_str = tx_str.replace(zero_width_joiner, u"")
    if tx_str.endswith(virama) and target_lang in ("hi_IN", "gu_IN", "bn_IN"):
        tx_str = tx_str[:-1]  # remove the last virama
    return self.transliterate_indic_indic(tx_str, "ml_IN", target_lang)
def transliterate_xx_en(self, word, src_lang):
    """
    Transliterate a word from an Indian language to English.

    English input is returned unchanged.  Kannada goes straight to
    transliterate_indic_en; every other language is first converted to
    Malayalam and then transliterated from Malayalam.
    """
    if src_lang in ("en_IN", "en_US"):
        return word
    # TODO: indic_en only carries tables for kn_IN and ml_IN, hence the
    # per-language routing below.  Once all Indic languages are added to
    # indic_en this can become a single call.
    if src_lang == "kn_IN":
        return self.transliterate_indic_en(word, src_lang)
    malayalam_word = word
    if src_lang != "ml_IN":
        malayalam_word = self.transliterate_indic_indic(word, src_lang,
                                                        "ml_IN")
    return self.transliterate_indic_en(malayalam_word, "ml_IN")
def transliterate_iso15919(self, word, src_language):
    """
    Transliterate `word` (written in `src_language`) using the "ISO15919"
    table of `charmap`, indexed by each character's offset from the
    language's base code point in `lang_bases`.
    """
    final_a_dropped = ("hi_IN", "gu_IN", "bn_IN")
    last_position = len(word)
    tx_str = ""
    for position, letter in enumerate(word, 1):
        offset = ord(letter) - lang_bases[src_language]
        # 76 is the virama offset for all Indian languages from their base;
        # offsets 61..76 drop the previously emitted character.
        if 61 <= offset <= 76:
            tx_str = tx_str[:-1]  # remove the last 'a'
        if 0 < offset <= 128:
            tx_str = tx_str + charmap["ISO15919"][offset]
        # Delete the inherent 'a' at the end of a multi-letter word for
        # Hindi, Gujarati and Bengali.
        if (tx_str[-1:] == 'a'
                and src_language in final_a_dropped
                and position == last_position
                and last_position > 1):
            tx_str = tx_str[:-1]
    return tx_str.decode("utf-8")
def transliterate_ipa(self, word, src_language):
    """
    Transliterate the given word in src_language to
    IPA - International Phonetic Alphabet notation.

    Characters with a code point below 255 are copied through unchanged.
    Other characters are looked up in the "IPA" table of `charmap` by their
    offset from the language's base code point in `lang_bases`.
    """
    # The schwa is written as an escape because the original literal was
    # encoding-damaged.  It is kept in both text and UTF-8 form so that it
    # always matches the type of the string being built; comparing a
    # one-element slice against a multi-byte schwa could never succeed.
    schwa_text = u"\u0259"
    schwa_bytes = schwa_text.encode("utf-8")
    final_schwa_dropped = ("hi_IN", "gu_IN", "bn_IN")

    tx_str = ""
    last_position = len(word)
    for position, letter in enumerate(word, 1):
        if ord(letter) < 255:
            tx_str += letter
            continue
        offset = ord(letter) - lang_bases[src_language]
        # 76 is the virama offset for all Indian languages from their base;
        # offsets 61..76 drop the previously emitted schwa.
        if 61 <= offset <= 76:
            schwa = schwa_bytes if isinstance(tx_str, bytes) else schwa_text
            tx_str = tx_str[:-len(schwa)]
        if 0 < offset <= 128:
            tx_str = tx_str + charmap["IPA"][offset]
        # Delete the inherent schwa at the end of a multi-letter word for
        # Hindi, Gujarati and Bengali.
        schwa = schwa_bytes if isinstance(tx_str, bytes) else schwa_text
        if (position == last_position
                and last_position > 1
                and src_language in final_schwa_dropped
                and tx_str.endswith(schwa)):
            tx_str = tx_str[:-len(schwa)]
    # Decode only when a byte string was built (the Python 2 case).
    if isinstance(tx_str, bytes):
        tx_str = tx_str.decode("utf-8")
    return tx_str
def _malayalam_fixes(self, text): | |
try: | |
text = text.replace(u"മൠ",u"ം ") | |
text = text.replace(u"à´®àµ,",u"à´‚,") | |
text = text.replace(u"à´®àµ.",u"à´‚.") | |
text = text.replace(u"à´®àµ)",u"à´‚)") | |
text = text.replace(u"à´©",u"à´¨") | |
text = text.replace(u"൤",u".") #danda by fullstop | |
except: | |
pass | |
return text | |
def transliterate_indic_indic(self, word, src_lang, target_lang):
    """
    Transliterate a word from one Indian language to another by shifting
    each character by the difference between the two scripts' base code
    points (see getOffset), followed by language-specific clean-ups.

    Punctuation and characters outside the Indic blocks are copied through
    unchanged.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr

    # First and last code points of the parallel Indic blocks
    # (Devanagari ... Malayalam).  The original test
    # "ord <= 2304 and ord >= 3071" could never be true, so Latin letters,
    # digits and joiners were shifted or dropped; its upper bound also
    # stopped short of the Kannada and Malayalam blocks.
    indic_first, indic_last = 0x0900, 0x0D7F
    virama_dropping = ("hi_IN", "gu_IN", "pa_IN", "bn_IN")

    tx_str = ""
    word = normalizer.normalize(word)
    if src_lang == "ml_IN" and target_lang != "ml_IN":
        word = word.replace(u"\u200C", u"")
        word = word.replace(u"\u200D", u"")
        # Drop samvruthokaram.
        # NOTE(review): pattern decoded from a damaged literal as
        # U+0D41 + U+0D4D; the replacement was empty in the source - confirm.
        word = word.replace(u"\u0d41\u0d4d", u"")

    shift = self.getOffset(src_lang, target_lang)
    last_position = len(word)
    for position, letter in enumerate(word, 1):
        code = ord(letter)
        if (letter in string.punctuation
                or code < indic_first or code > indic_last):
            tx_str = tx_str + letter
            continue
        offset = code + shift
        if offset > 0:
            tx_str = tx_str + to_char(offset)
        # 76 is the virama offset from the script base: a word-final virama
        # is removed for the languages listed in virama_dropping.
        # TODO Add more languages having schwa deletion characteristic
        baseoffset = offset - lang_bases[target_lang]
        if (position == last_position
                and baseoffset == 76
                and target_lang in virama_dropping):
            tx_str = tx_str[:-1]

    if target_lang == "ml_IN" and src_lang == "ta_IN":
        tx_str = tx_str.replace(u"\u0d29", u"\u0d28")

    if target_lang == "ta_IN":
        # Code points produced by the shift that have no Tamil letter are
        # folded onto KA, CA, TTA, TA and PA respectively.
        tamil_fallbacks = (
            (u"\u0B96", u"\u0b95"), (u"\u0B97", u"\u0b95"),
            (u"\u0B98", u"\u0b95"),
            (u"\u0B9B", u"\u0b9a"), (u"\u0B9D", u"\u0b9a"),
            (u"\u0BA0", u"\u0b9f"), (u"\u0BA1", u"\u0b9f"),
            (u"\u0BA2", u"\u0b9f"),
            (u"\u0BA5", u"\u0ba4"), (u"\u0BA6", u"\u0ba4"),
            (u"\u0BA7", u"\u0ba4"),
            (u"\u0BAB", u"\u0baa"), (u"\u0BAC", u"\u0baa"),
            (u"\u0BAD", u"\u0baa"),
            (u"\u0BC3", u"\u0bbf\u0bb0\u0bcd"),
            (u"\u0b82", u"\u0bae\u0bcd"),
        )
        for missing, substitute in tamil_fallbacks:
            tx_str = tx_str.replace(missing, substitute)

    # If the target is Malayalam and the source is one of the
    # virama-dropping languages, add the virama back.  Slicing keeps this
    # safe when nothing was produced.
    if (target_lang == "ml_IN"
            and src_lang in virama_dropping
            and tx_str[-1:].isalpha()):
        tx_str = tx_str + u"\u0d4d"
    return tx_str
def transliterate_indic_en(self, word, src_lang):
    """
    Transliterate Indic text to Latin letters, one character at a time,
    using the tables returned by the indic_en helper functions.

    Arguments:
    - `self`:
    - `word`: Word to be transliterated (sentence)
    - `src_lang`: Language from which we need to transliterate
    """
    # Language-specific tables and symbols.
    dictionary = get_dictionary_for(src_lang)
    vowels = get_vowels_for(src_lang)
    vowel_signs = get_vowel_signs_for(src_lang)
    virama = get_virama_for(src_lang)
    anuswara = get_anuswara_for(src_lang)

    last = len(word) - 1
    pieces = []
    for position, current in enumerate(word):
        # Punctuation is copied as is; handling it up front avoids an extra
        # 'a' in front of the word that follows it.
        if current in string.punctuation:
            pieces.append(current)
            continue
        # The virama (conjunct marker) produces no output.
        if current == virama:
            continue
        # English equivalent of the character, or the character itself when
        # the table has no entry for it.
        try:
            pieces.append(dictionary[current])
        except KeyError:
            pieces.append(current)

        if position < last:
            following = word[position + 1]
            # Inherent 'a' between two mapped characters.
            if (following not in vowel_signs
                    and following in dictionary
                    and current not in vowels
                    and current not in vowel_signs):
                pieces.append('a')
            # Handle the "am" sign.
            if following == anuswara and current not in vowel_signs:
                pieces.append('a')
        elif current not in vowel_signs and current in dictionary:
            # Inherent 'a' after the last character of the word.
            pieces.append('a')
    return "".join(pieces)
@ServiceMethod
def transliterate(self, text, target_lang_code):
    """
    Transliterate `text` word by word into `target_lang_code`.

    `target_lang_code` is a language code or one of the special targets
    "ISO15919" and "IPA".  The text is split on newlines and single spaces;
    the language of every word is detected with detect_lang.  Words whose
    language cannot be detected are copied through with a leading space.
    """
    tx_str = ""
    lines = text.split("\n")
    for line in lines:
        words = line.split(" ")
        for word in words:
            if not word.strip():
                # Empty or whitespace-only fragment: keep as is.
                tx_str = tx_str + word
                continue
            try:
                src_lang_code = detect_lang(word)[word]
            except Exception:
                # FIXME: detection failed, pass the word through.
                tx_str = tx_str + " " + word
                continue
            if target_lang_code == "ISO15919":
                tx_str = tx_str + \
                    self.transliterate_iso15919(word, src_lang_code) + " "
            elif target_lang_code == "IPA":
                tx_str = tx_str + \
                    self.transliterate_ipa(word, src_lang_code) + " "
            elif src_lang_code == "en_US":
                tx_str = tx_str + \
                    self.transliterate_en_xx(word, target_lang_code) + " "
            elif target_lang_code in ("en_US", "en_IN"):
                tx_str = tx_str + \
                    self.transliterate_xx_en(word, src_lang_code) + " "
            else:
                tx_str += self.transliterate_indic_indic(word,
                                                         src_lang_code,
                                                         target_lang_code)
                # Separate words whenever the line has more than one.  The
                # test used to look at the number of lines, which glued the
                # words of a single-line input together.
                if len(words) > 1:
                    tx_str += " "
        if len(lines) > 1:
            tx_str += "\n"
    # Language specific fixes
    if target_lang_code == "ml_IN":
        tx_str = self._malayalam_fixes(tx_str)
    return tx_str
def getOffset(self, src, target):
    """
    Return the distance between the base code points of two languages as
    listed in `lang_bases` (target minus source).

    Returns 0 when either language code is missing from `lang_bases`.
    """
    try:
        return lang_bases[target] - lang_bases[src]
    except KeyError:
        # Unknown language code.  Only the lookup failure is swallowed, so
        # unrelated errors are no longer hidden.
        return 0
def get_module_name(self):
    """Return the display name of this module."""
    module_name = "Transliterator"
    return module_name
def get_info(self):
    """Return a one-line description of this module."""
    description = "Transliterate the text between any Indian Language"
    return description
def getInstance():
    """Create and return a new Transliterator."""
    instance = Transliterator()
    return instance
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Boss putting 4/5 files as a gist is too much OK. this should be a repo only.