Skip to content

Instantly share code, notes, and snippets.

@informationsea
Created January 17, 2013 14:12
Show Gist options
  • Save informationsea/4556171 to your computer and use it in GitHub Desktop.
Save informationsea/4556171 to your computer and use it in GitHub Desktop.
Remove TeX control sequences from TeX source (for LanguageTool)
#!/usr/bin/env python
__author__ = 'Y.OKAMURA <okamura=AT=informationsea.info>'
__copyright__ = 'Copyright (C) 2013 Y.OKAMURA'
__license__ = 'GPL3+'
import argparse
import re
import sys
# Headings and captions whose argument text (group 3) is kept in the output.
_SECTION_RE = re.compile(r'\\((sub)*section\*?|caption)\{([^\}]+)\}')
# Everything that is blanked out: control words, comments, lone braces,
# commands with arguments and inline math.
_REMOVE_RE = re.compile(r'(\\[^\{\}\s\\]+\s|%.+|\s[\{\}]\s|\\[^\{]+([\{\[][^\{\}]+[\}\]])+|\$[^\$]+\$|[\{\}])')
# \begin{name} / \end{name} markers used to track the open environments.
_ENVIRONMENT_RE = re.compile(r'\\(begin|end)\{([^\}]+)\}')
# Environments whose contents are processed; lines inside any other
# environment are replaced by an empty line.
_KEPT_ENVIRONMENTS = ('center', 'itemize', 'document')


def _track_environments(line, stack):
    """Update *stack* with the environments opened and closed on *line*.

    Each ``\\begin{name}`` pushes ``name``; each ``\\end{...}`` pops the most
    recently opened environment.  An ``\\end`` without a matching ``\\begin``
    is ignored instead of raising ``IndexError``.
    """
    for match in _ENVIRONMENT_RE.finditer(line):
        if match.group(1) == 'begin':
            stack.append(match.group(2))
        elif stack:
            stack.pop()


def _keep_title(match):
    """Return the heading matched by *match* with only its title text kept.

    The command name and both braces are replaced by spaces so the title
    stays in its original column and the line length is unchanged.
    """
    leading = match.start(3) - match.start()
    trailing = match.end() - match.end(3)
    return u' ' * leading + match.group(3) + u' ' * trailing


def _blank(text):
    """Return *text* with all characters but line terminators turned into spaces."""
    return u''.join(ch if ch in u'\r\n' else u' ' for ch in text)


def _blank_control_sequences(line):
    """Overwrite every TeX construct matched by ``_REMOVE_RE`` with spaces.

    The replacement has the same length as the match and keeps a trailing
    newline, so columns and line numbers of the remaining text are preserved.
    """
    pos = 0
    while True:
        match = _REMOVE_RE.search(line, pos)
        if match is None:
            return line
        line = line[:match.start()] + _blank(match.group(0)) + line[match.end():]
        # Resume on the last replaced character: it is whitespace now and may
        # serve as the leading ``\s`` of a following ``\s[{}]\s`` match.
        pos = match.end() - 1


def _main():
    """Command-line entry point: strip TeX markup from a TeX source file.

    Reads the optional positional ``input`` (default: standard input) line by
    line, decodes each line as ISO-2022-JP, and writes the result UTF-8
    encoded to the optional positional ``output`` (default: standard output).

    Lines inside an environment other than ``center``, ``itemize`` or
    ``document`` are replaced by an empty line.  On all other lines the text
    of section headings and captions is kept, and the remaining control
    sequences, comments, inline math and braces are replaced by spaces.
    """
    parser = argparse.ArgumentParser(description="Remove TeX control sequences from TeX source")
    parser.add_argument('input', type=argparse.FileType('r'), default=sys.stdin, nargs='?')
    parser.add_argument('output', type=argparse.FileType('w'), default=sys.stdout, nargs='?')
    options = parser.parse_args()

    # Work on bytes: Python 3 text streams expose the underlying binary
    # stream as ``.buffer``; Python 2 file objects already yield byte strings.
    source = getattr(options.input, 'buffer', options.input)
    sink = getattr(options.output, 'buffer', options.output)

    open_environments = []
    try:
        for raw_line in source:
            line = raw_line.decode('ISO-2022-JP')
            _track_environments(line, open_environments)
            if any(name not in _KEPT_ENVIRONMENTS for name in open_environments):
                # Emit an empty line so output line numbers match the input.
                sink.write(b'\n')
                continue
            line = _SECTION_RE.sub(_keep_title, line)
            line = _blank_control_sequences(line)
            sink.write(line.encode('utf-8'))
        sink.flush()
    finally:
        # Close the files argparse opened, but leave the standard streams open.
        for stream in (options.input, options.output):
            if stream is not sys.stdin and stream is not sys.stdout:
                stream.close()
# Run the converter only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    _main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment