Last active
October 5, 2021 13:26
-
-
Save rishi-raj-jain/f58bc760b2df562dade01d2a3f049a11 to your computer and use it in GitHub Desktop.
Easier than ever to include hindi text in pdflatex-only files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Process.py | |
# The repaired version of https://gist.github.com/shreevatsa/4d476ac26a367fa68984d8c06867d7dd#file-get-dn-py | |
from __future__ import unicode_literals | |
import os | |
import re | |
import subprocess | |
import sys | |
consonants = { | |
0x0915: ['k'], | |
0x0916: ['kh'], | |
0x0917: ['g'], | |
0x0918: ['gh'], | |
0x0919: ['"n'], | |
0x091A: ['c'], | |
0x091B: ['ch'], | |
0x091C: ['j'], | |
0x091D: ['jh'], | |
0x091E: ['~n'], | |
0x091F: ['.t'], | |
0x0920: ['.th'], | |
0x0921: ['.d'], | |
0x0922: ['.dh'], | |
0x0923: ['.n'], | |
0x0924: ['t'], | |
0x0925: ['th'], | |
0x0926: ['d'], | |
0x0927: ['dh'], | |
0x0928: ['n'], | |
0x092A: ['p'], | |
0x092B: ['ph'], | |
0x092C: ['b'], | |
0x092D: ['bh'], | |
0x092E: ['m'], | |
0x092F: ['y'], | |
0x0930: ['r'], | |
0x0932: ['l'], | |
0x0933: ['L'], | |
0x0935: ['v'], | |
0x0936: ['"s'], | |
0x0937: ['.s'], | |
0x0938: ['s'], | |
0x0939: ['h'], | |
0x0958: ['q'], | |
0x0959: ['.kh'], | |
0x095A: ['.g'], | |
0x095B: ['z'], | |
0x095C: ['R'], | |
0x095D: ['Rh'], | |
0x095E: ['f'], | |
} | |
vowel_signs = { | |
0x093E: ['aa'], | |
0x093F: ['i'], | |
0x0940: ['ii'], | |
0x0941: ['u'], | |
0x0942: ['uu'], | |
0x0943: ['.r'], | |
0x0944: ['.R', '.r.r'], | |
0x0947: ['e'], | |
0x0948: ['ai'], | |
0x0949: ['~o'], | |
0x094B: ['o'], | |
0x094C: ['au'], | |
0x0962: ['.l'], | |
0x0963: ['.ll', '.l.l'], | |
} | |
vowels = { | |
0x0905: ['a'], | |
0x0906: ['aa'], | |
0x0907: ['i'], | |
0x0908: ['ii'], | |
0x0909: ['u'], | |
0x090A: ['uu'], | |
0x090B: ['.r'], | |
0x090C: ['.l'], | |
0x090F: ['e'], | |
0x0910: ['ai'], | |
0x0913: ['o'], | |
0x0914: ['au'], | |
0x0960: ['.R'], | |
0x0961: ['.L'], | |
0x0972: ['~a'], | |
} | |
other = { | |
# 0x002E: ['..'], | |
0x0901: ['/'], | |
0x0902: ['.m'], | |
0x0903: ['.h'], | |
0x093D: ['.a'], | |
0x094D: ['&'], | |
0x0950: ['.o'], | |
0x0964: ['|'], | |
0x0965: ['||'], | |
0x0966: ['0'], | |
0x0967: ['1'], | |
0x0968: ['2'], | |
0x0969: ['3'], | |
0x096A: ['4'], | |
0x096B: ['5'], | |
0x096C: ['6'], | |
0x096D: ['7'], | |
0x096E: ['8'], | |
0x096F: ['9'], | |
0x0970: ['@'], | |
0x0971: ['#'], | |
} | |
re_consonant = '|'.join(chr(n) for n in consonants) | |
re_vowel_sign = '|'.join(chr(n) for n in vowel_signs) | |
re_vowel = '|'.join(chr(n) for n in vowels) | |
re_other = '|'.join(chr(n) for n in other) | |
re_virama = chr(0x094D) | |
re_a = vowels[0x0905][0] # 'a' | |
def velthuis(devanagari): | |
text = devanagari | |
text = re.sub('(%s)(%s)' % (re_consonant, re_vowel_sign), | |
lambda match: consonants[ord(match.group(1))][0] + vowel_signs[ord(match.group(2))][0], | |
text) | |
text = re.sub('(%s)(%s)' % (re_consonant, re_virama), | |
lambda match: consonants[ord(match.group(1))][0], | |
text) | |
text = re.sub('(%s)' % re_consonant, | |
lambda match: consonants[ord(match.group(1))][0] + re_a, | |
text) | |
text = re.sub('(%s)' % re_vowel, | |
lambda match: vowels[ord(match.group(1))][0], | |
text) | |
text = re.sub('(%s)' % re_other, | |
lambda match: other[ord(match.group(1))][0], | |
text) | |
return text | |
def wikner(devanagari): | |
text = devanagari | |
text = re.sub('(%s)(%s)' % (re_consonant, re_vowel_sign), | |
lambda match: consonants[ord(match.group(1))][-1] + vowel_signs[ord(match.group(2))][-1], | |
text) | |
text = re.sub('(%s)(%s)' % (re_consonant, re_virama), | |
lambda match: consonants[ord(match.group(1))][-1], | |
text) | |
text = re.sub('(%s)' % re_consonant, | |
lambda match: consonants[ord(match.group(1))][-1] + re_a, | |
text) | |
text = re.sub('(%s)' % re_vowel, | |
lambda match: vowels[ord(match.group(1))][-1], | |
text) | |
text = re.sub('(%s)' % re_other, | |
lambda match: other[ord(match.group(1))][-1], | |
text) | |
return text | |
random_filename = 'lwfzal3XBeV8H10I8f4n' | |
# Added support of nukta from https://gist.github.com/ritwikmishra/9f8d6de45aff8fbe959d4260269d9eeb | |
nukta_dict = { | |
'क़': 'क़', | |
'ख़': 'ख़', | |
'ग़': 'ग़', | |
'ज़': 'ज़', | |
'ड़': 'ड़', | |
'ढ़': 'ढ़', | |
'फ़': 'फ़', | |
'य़': 'य़', | |
'ऱ': 'ऱ', | |
'ऴ': 'ऴ' | |
} | |
def get_preprocessed(filename, ext): | |
preprocessor = { | |
'dn': 'devnag', | |
'skt': './skt', | |
} | |
assert ext in preprocessor.keys(), ext | |
text = open(filename).read() | |
for k in nukta_dict.keys(): | |
text = text.replace(k,nukta_dict[k]) | |
transliterated = velthuis(text) if ext == 'dn' else wikner(text) | |
infile = '%s-%s.%s' % (random_filename, ext, ext) | |
open(infile, 'w').write(r'{\%s %s}' % (ext, transliterated)) | |
p = subprocess.Popen([preprocessor[ext], infile], | |
stdout=subprocess.PIPE, | |
stderr=subprocess.PIPE, | |
close_fds=True) | |
out, err, ret = p.stdout.read(), p.stderr.read(), p.returncode | |
if err or ret: | |
print ('input: <%s>' % text.encode('utf-8')) | |
print ('transliterated: <%s>' % transliterated) | |
print ('stdout: <%s>' % out) | |
print ('stderr: <%s>' % err) | |
print ('returned: <%s>' % ret) | |
raise ValueError | |
outfile = '%s-%s.tex' % (random_filename, ext) | |
translation = open(outfile).read() | |
os.remove(outfile) | |
os.remove(infile) | |
prefix = r'\def\DevnagVersion{2.15}{\dn ' if ext == 'dn' else r'{\skt ' | |
translation = translation[len(prefix):-1] | |
return translation | |
ext= 'dn' # Compiler format | |
filename= '../input/santshr/name.dn' # Source of the file | |
print(get_preprocessed(filename, ext)) # PdfLaTeX ready content |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for that ✨
With kaggle, you don't need to even run a compiler.