Created
August 31, 2012 12:44
-
-
Save ptigas/3552263 to your computer and use it in GitHub Desktop.
latex escape
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- encoding: utf-8 -*- | |
def latex_decode( s ) : | |
''' | |
A simple function which taks a latex escaping command | |
and returns the appropriate unicode character according | |
to this table (via wikipedia): | |
LaTeX command Sample Description | |
\`{o} ò grave accent | |
\'{o} ó acute accent | |
\^{o} ô circumflex | |
\"{o} ö umlaut or dieresis | |
\H{o} ő long Hungarian umlaut (double acute) | |
\~{o} õ tilde | |
\c{c} ç cedilla | |
\k{a} ą ogonek | |
\l ł l with stroke | |
\={o} ō macron accent (a bar over the letter) | |
\b{o} o bar under the letter | |
\.{o} ȯ dot over the letter | |
\d{u} ụ dot under the letter | |
\r{a} å ring over the letter (for å there is also the special command \aa) | |
\u{o} ŏ breve over the letter | |
\v{s} š caron/hacek ("v") over the letter | |
\t{oo} o͡o "tie" (inverted u) over the two letters | |
''' | |
import re | |
grave_accent = re.compile(r"\\\`\{([a-zA-Z])\}") | |
acute_accent = re.compile(r"\\\'\{([a-zA-Z])\}") | |
circumflex = re.compile(r"\\\^\{([a-zA-Z])\}") | |
umlaut = re.compile(r"\\\"\{([a-zA-Z])\}") | |
long_umlaut = re.compile(r"\\\H\{([a-zA-Z])\}") | |
tilde = re.compile(r"\\~\{([a-zA-Z])\}") | |
cedilla = re.compile(r"\\c\{([a-zA-Z])\}") | |
ogonek = re.compile(r"\\\k\{([a-zA-Z])\}") | |
macron_accent = re.compile(r"\\\=\{([a-zA-Z])\}") | |
bar_under_letter = re.compile(r"\\b\{([a-zA-Z])\}") | |
dot_over_letter = re.compile(r"\\\.\{([a-zA-Z])\}") | |
dot_under_letter = re.compile(r"\\d\{([a-zA-Z])\}") | |
ring_over_letter = re.compile(r"\\r\{([a-zA-Z])\}") | |
breve_over_letter = re.compile(r"\\u\{([a-zA-Z])\}") | |
caron_over_letter = re.compile(r"\\v\{([a-zA-Z])\}") | |
tie_over_letters = re.compile(r"\\t\{([a-zA-Z])([a-zA-Z])\}") | |
s = grave_accent.sub( lambda m: m.group(1) + u'\u0300', s ) | |
s = acute_accent.sub( lambda m: m.group(1) + u'\u0301', s ) | |
s = circumflex.sub( lambda m: m.group(1) + u'\u0311', s ) | |
s = umlaut.sub( lambda m: m.group(1) + u'\u0308', s ) | |
s = long_umlaut.sub( lambda m: m.group(1) + u'\u030b', s ) | |
s = tilde.sub( lambda m: m.group(1) + u'\u0303', s ) | |
s = cedilla.sub( lambda m: m.group(1) + u'\u0327', s ) | |
s = ogonek.sub( lambda m: m.group(1) + u'\u0328', s ) | |
s = macron_accent.sub( lambda m: m.group(1) + u'\u0328', s ) | |
s = bar_under_letter.sub( lambda m: m.group(1) + u'\u0331', s ) | |
s = dot_over_letter.sub( lambda m: m.group(1) + u'\u0307', s ) | |
s = dot_under_letter.sub( lambda m: m.group(1) + u'\u0323', s ) | |
s = ring_over_letter.sub( lambda m: m.group(1) + u'\u030a', s ) | |
s = breve_over_letter.sub( lambda m: m.group(1) + u'\u0306', s ) | |
s = caron_over_letter.sub( lambda m: m.group(1) + u'\u030c', s ) | |
s = tie_over_letters.sub( lambda m: m.group(1) + u'\u0361' + m.group(2) , s ) | |
return s | |
if __name__ == '__main__' : | |
tests = [ | |
( "\`{o}", 'ò' ), | |
( "\\\'{o}", 'ó' ), | |
( "\^{o}", 'ô' ), | |
( "\\\"{o}", 'ö'), | |
( "\H{o}", 'ő'), | |
( "\~{o}", 'õ'), | |
( "\c{c}", 'ç'), | |
( "\k{a}", 'ą'), | |
( "\={o}", 'ō'), | |
( "\\b{o}", 'o'), | |
( "\.{u}", 'ụ'), | |
( "\d{u}", 'u'), | |
( "\\r{a}", 'å'), | |
( "\u{o}", 'ŏ'), | |
( "\\v{s}", 'š'), | |
( "\\t{oo}", 'o͡o'), | |
] | |
for t in tests : | |
print t[1], latex_decode(t[0]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment