Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save rishi-raj-jain/f58bc760b2df562dade01d2a3f049a11 to your computer and use it in GitHub Desktop.
Save rishi-raj-jain/f58bc760b2df562dade01d2a3f049a11 to your computer and use it in GitHub Desktop.
Easier than ever to include hindi text in pdflatex-only files
# Process.py
# The repaired version of https://gist.github.com/shreevatsa/4d476ac26a367fa68984d8c06867d7dd#file-get-dn-py
from __future__ import unicode_literals
import os
import re
import subprocess
import sys
consonants = {
0x0915: ['k'],
0x0916: ['kh'],
0x0917: ['g'],
0x0918: ['gh'],
0x0919: ['"n'],
0x091A: ['c'],
0x091B: ['ch'],
0x091C: ['j'],
0x091D: ['jh'],
0x091E: ['~n'],
0x091F: ['.t'],
0x0920: ['.th'],
0x0921: ['.d'],
0x0922: ['.dh'],
0x0923: ['.n'],
0x0924: ['t'],
0x0925: ['th'],
0x0926: ['d'],
0x0927: ['dh'],
0x0928: ['n'],
0x092A: ['p'],
0x092B: ['ph'],
0x092C: ['b'],
0x092D: ['bh'],
0x092E: ['m'],
0x092F: ['y'],
0x0930: ['r'],
0x0932: ['l'],
0x0933: ['L'],
0x0935: ['v'],
0x0936: ['"s'],
0x0937: ['.s'],
0x0938: ['s'],
0x0939: ['h'],
0x0958: ['q'],
0x0959: ['.kh'],
0x095A: ['.g'],
0x095B: ['z'],
0x095C: ['R'],
0x095D: ['Rh'],
0x095E: ['f'],
}
vowel_signs = {
0x093E: ['aa'],
0x093F: ['i'],
0x0940: ['ii'],
0x0941: ['u'],
0x0942: ['uu'],
0x0943: ['.r'],
0x0944: ['.R', '.r.r'],
0x0947: ['e'],
0x0948: ['ai'],
0x0949: ['~o'],
0x094B: ['o'],
0x094C: ['au'],
0x0962: ['.l'],
0x0963: ['.ll', '.l.l'],
}
vowels = {
0x0905: ['a'],
0x0906: ['aa'],
0x0907: ['i'],
0x0908: ['ii'],
0x0909: ['u'],
0x090A: ['uu'],
0x090B: ['.r'],
0x090C: ['.l'],
0x090F: ['e'],
0x0910: ['ai'],
0x0913: ['o'],
0x0914: ['au'],
0x0960: ['.R'],
0x0961: ['.L'],
0x0972: ['~a'],
}
other = {
# 0x002E: ['..'],
0x0901: ['/'],
0x0902: ['.m'],
0x0903: ['.h'],
0x093D: ['.a'],
0x094D: ['&'],
0x0950: ['.o'],
0x0964: ['|'],
0x0965: ['||'],
0x0966: ['0'],
0x0967: ['1'],
0x0968: ['2'],
0x0969: ['3'],
0x096A: ['4'],
0x096B: ['5'],
0x096C: ['6'],
0x096D: ['7'],
0x096E: ['8'],
0x096F: ['9'],
0x0970: ['@'],
0x0971: ['#'],
}
re_consonant = '|'.join(chr(n) for n in consonants)
re_vowel_sign = '|'.join(chr(n) for n in vowel_signs)
re_vowel = '|'.join(chr(n) for n in vowels)
re_other = '|'.join(chr(n) for n in other)
re_virama = chr(0x094D)
re_a = vowels[0x0905][0] # 'a'
def velthuis(devanagari):
text = devanagari
text = re.sub('(%s)(%s)' % (re_consonant, re_vowel_sign),
lambda match: consonants[ord(match.group(1))][0] + vowel_signs[ord(match.group(2))][0],
text)
text = re.sub('(%s)(%s)' % (re_consonant, re_virama),
lambda match: consonants[ord(match.group(1))][0],
text)
text = re.sub('(%s)' % re_consonant,
lambda match: consonants[ord(match.group(1))][0] + re_a,
text)
text = re.sub('(%s)' % re_vowel,
lambda match: vowels[ord(match.group(1))][0],
text)
text = re.sub('(%s)' % re_other,
lambda match: other[ord(match.group(1))][0],
text)
return text
def wikner(devanagari):
text = devanagari
text = re.sub('(%s)(%s)' % (re_consonant, re_vowel_sign),
lambda match: consonants[ord(match.group(1))][-1] + vowel_signs[ord(match.group(2))][-1],
text)
text = re.sub('(%s)(%s)' % (re_consonant, re_virama),
lambda match: consonants[ord(match.group(1))][-1],
text)
text = re.sub('(%s)' % re_consonant,
lambda match: consonants[ord(match.group(1))][-1] + re_a,
text)
text = re.sub('(%s)' % re_vowel,
lambda match: vowels[ord(match.group(1))][-1],
text)
text = re.sub('(%s)' % re_other,
lambda match: other[ord(match.group(1))][-1],
text)
return text
random_filename = 'lwfzal3XBeV8H10I8f4n'
# Added support of nukta from https://gist.github.com/ritwikmishra/9f8d6de45aff8fbe959d4260269d9eeb
nukta_dict = {
'क़': 'क़',
'ख़': 'ख़',
'ग़': 'ग़',
'ज़': 'ज़',
'ड़': 'ड़',
'ढ़': 'ढ़',
'फ़': 'फ़',
'य़': 'य़',
'ऱ': 'ऱ',
'ऴ': 'ऴ'
}
def get_preprocessed(filename, ext):
preprocessor = {
'dn': 'devnag',
'skt': './skt',
}
assert ext in preprocessor.keys(), ext
text = open(filename).read()
for k in nukta_dict.keys():
text = text.replace(k,nukta_dict[k])
transliterated = velthuis(text) if ext == 'dn' else wikner(text)
infile = '%s-%s.%s' % (random_filename, ext, ext)
open(infile, 'w').write(r'{\%s %s}' % (ext, transliterated))
p = subprocess.Popen([preprocessor[ext], infile],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
close_fds=True)
out, err, ret = p.stdout.read(), p.stderr.read(), p.returncode
if err or ret:
print ('input: <%s>' % text.encode('utf-8'))
print ('transliterated: <%s>' % transliterated)
print ('stdout: <%s>' % out)
print ('stderr: <%s>' % err)
print ('returned: <%s>' % ret)
raise ValueError
outfile = '%s-%s.tex' % (random_filename, ext)
translation = open(outfile).read()
os.remove(outfile)
os.remove(infile)
prefix = r'\def\DevnagVersion{2.15}{\dn ' if ext == 'dn' else r'{\skt '
translation = translation[len(prefix):-1]
return translation
ext= 'dn' # Compiler format
filename= '../input/santshr/name.dn' # Source of the file
print(get_preprocessed(filename, ext)) # PdfLaTeX ready content
@rishi-raj-jain
Copy link
Author

rishi-raj-jain commented Oct 5, 2021

Don't forget to compile the devnag.c file from here before doing this. https://ctan.um.ac.ir/language/devanagari/velthuis/bin/devnag.c

Thanks for that ✨
With kaggle, you don't need to even run a compiler.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment