-
-
Save amerberg/a273ca1e579ab573b499 to your computer and use it in GitHub Desktop.
import ply.lex, argparse, io | |
#Usage | |
# python stripcomments.py input.tex > output.tex | |
# python stripcomments.py input.tex -e encoding > output.tex | |
#This utility is released under the WTFPL license: http://www.wtfpl.net/about/ | |
def strip_comments(source):
    """Return the LaTeX text ``source`` with its comments removed.

    Three kinds of input are treated specially:

    * ``%`` line comments: everything from an unescaped ``%`` up to and
      including the end-of-line newline is dropped.
    * ``\\begin{comment} ... \\end{comment}`` environments: the whole
      environment is dropped, together with the rest of the line that
      follows ``\\end{comment}``.
    * ``\\begin{verbatim} ... \\end{verbatim}`` environments: copied
      through untouched, so ``%`` inside them is preserved.

    Escaped percent signs (``\\%``) and escaped backslashes (``\\\\``) are
    copied through and never start a comment.

    :param source: the LaTeX document as a (unicode) string.
    :returns: the document text with comments stripped.
    """
    # NOTE: ply.lex.lex() builds the lexer by introspecting the caller's
    # namespace, so ``tokens``, ``states`` and every ``t_*`` function below
    # are consumed by PLY even though nothing references them directly.
    # Each rule's docstring *is* its regular expression, so the rules are
    # documented with comments only. Function rules are tried in the order
    # they are defined, so the order below matters.
    tokens = (
        'PERCENT', 'BEGINCOMMENT', 'ENDCOMMENT', 'BACKSLASH',
        'CHAR', 'BEGINVERBATIM', 'ENDVERBATIM', 'NEWLINE', 'ESCPCT',
    )
    states = (
        ('linecomment', 'exclusive'),
        ('commentenv', 'exclusive'),
        ('verbatim', 'exclusive'),
    )

    # Deal with escaped backslashes, so we don't think they're escaping %.
    # This rule is restricted to the INITIAL state on purpose: when it was
    # active in every state, a "\\" inside a % comment or a comment
    # environment was returned as a token and leaked into the output.
    # In the other states the per-state CHAR rules handle each backslash.
    def t_BACKSLASH(t):
        r"\\\\"
        return t

    # One-line comments: drop the % itself and switch to linecomment.
    def t_PERCENT(t):
        r"\%"
        t.lexer.begin("linecomment")

    # Escaped percent signs are ordinary text.
    def t_ESCPCT(t):
        r"\\\%"
        return t

    # Comment environment, as defined by the verbatim package.
    def t_BEGINCOMMENT(t):
        r"\\begin\s*{\s*comment\s*}"
        t.lexer.begin("commentenv")

    # Verbatim environment (different treatment of comments within).
    def t_BEGINVERBATIM(t):
        r"\\begin\s*{\s*verbatim\s*}"
        t.lexer.begin("verbatim")
        return t

    # Any other character in the initial state we leave alone.
    def t_CHAR(t):
        r"."
        return t

    def t_NEWLINE(t):
        r"\n"
        return t

    # End of comment environment.
    def t_commentenv_ENDCOMMENT(t):
        r"\\end\s*{\s*comment\s*}"
        # Anything after \end{comment} on a line is ignored!
        t.lexer.begin('linecomment')

    # Ignore the contents of a comment environment.
    def t_commentenv_CHAR(t):
        r"."
        pass

    def t_commentenv_NEWLINE(t):
        r"\n"
        pass

    # End of verbatim environment.
    def t_verbatim_ENDVERBATIM(t):
        r"\\end\s*{\s*verbatim\s*}"
        t.lexer.begin('INITIAL')
        return t

    # Leave the contents of a verbatim environment alone.
    def t_verbatim_CHAR(t):
        r"."
        return t

    def t_verbatim_NEWLINE(t):
        r"\n"
        return t

    # End a % comment when we get to a new line.
    # The newline at the end of a line comment is stripped.
    def t_linecomment_ENDCOMMENT(t):
        r"\n"
        t.lexer.begin("INITIAL")

    # Ignore anything after a % on a line.
    def t_linecomment_CHAR(t):
        r"."
        pass

    # Every state already has rules for "." and "\n", so this handler is
    # not expected to fire; defining it stops PLY from warning that no
    # error rule exists. If it ever does fire, skip one character so the
    # lexer keeps making progress.
    def t_ANY_error(t):
        t.lexer.skip(1)

    lexer = ply.lex.lex()
    lexer.input(source)
    # Tokens that were not returned by a rule (comment text) never reach
    # this loop, so joining the token values yields the stripped document.
    return u"".join(tok.value for tok in lexer)
def main():
    """Command-line entry point: strip comments from a LaTeX file.

    Reads the file named on the command line using ``--encoding``
    (default ``utf-8``), strips its comments and writes the result,
    followed by a newline, to standard output.
    """
    # Local import so this function does not depend on the file's
    # top-level import line.
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help='the file to strip comments from')
    parser.add_argument('--encoding', '-e', default='utf-8')
    args = parser.parse_args()
    with io.open(args.filename, encoding=args.encoding) as f:
        source = f.read()

    # print() encodes with whatever codec stdout happens to have, which is
    # ASCII when output is redirected under Python 2 (or under a C locale)
    # and then fails with UnicodeEncodeError on non-ASCII text. Encode
    # explicitly with the same encoding used to read the file and write
    # the bytes directly. The trailing newline matches what print() added.
    data = (strip_comments(source) + u"\n").encode(args.encoding)
    # Python 3 exposes the byte stream as sys.stdout.buffer; on Python 2
    # sys.stdout itself accepts byte strings.
    stream = getattr(sys.stdout, 'buffer', sys.stdout)
    stream.write(data)
# Run the command-line interface only when executed as a script,
# not when this file is imported as a module.
if __name__ == "__main__":
    main()
Console says:
WARNING: No t_error rule is defined
WARNING: No error rule is defined for exclusive state 'verbatim'
WARNING: No error rule is defined for exclusive state 'commentenv'
WARNING: No error rule is defined for exclusive state 'linecomment'
Traceback (most recent call last):
File "/Users/evlogii/Downloads/strip_comments.py", line 111, in
main()
File "/Users/evlogii/Downloads/strip_comments.py", line 108, in main
print(strip_comments(source))
UnicodeEncodeError: 'ascii' codec can't encode characters in position 279-285: ordinal not in range(128)
My file is in UTF-8 (it contains some Cyrillic symbols) and I just run python stripcomments.py input.tex > output.tex
any suggestions? =/
The % at the end of the 2nd line needs to be preserved or the code can't run. How can I solve that?
\makeatletter
\def\alloc@#1#2#3#4#5%
{\ifnum\count1#1<#4% make sure there's still room
\allocationnumber\count1#1
\global\advance\count1#1\@ne
\global#3#5\allocationnumber
\wlog{\string#5=\string#2\the\allocationnumber}%
\else\ifnum#1<6
\def\etex@dummy@definition{}% <-- code added
\begingroup \escapechar\m@ne
\expandafter\alloc@@\expandafter{\string#2}#5%
\else\errmessage{No room for a new #2}\fi\fi
}
\makeatother
UPDATE:
I've modified the snippet to solve my problem, along with @m3phisto's suggestion.
https://gist.github.com/dzhuang/dc34cdd7efa43e5ecc1dc981cc906c85
Thank you for the useful code.
For anyone stumbling over this in the future: latexpand
can reliably remove comments, too.
Wow, I completely forgot about this and didn't see all these comments. Thanks to everyone who has made improvements. I've added a comment to clarify the licensing situation.
To remove all the comments from a LaTeX file, another option is to use arxiv-latex-cleaner. Actively maintained, 1.2k GitHub stars, written in Python but no need to know Python.
I believe the line "def t_ANY_BACKSLASH(t):" should be changed to "def t_BACKSLASH(t):". Otherwise, double backslashes appearing in a line comment are written to the output. Apart from that, very useful and easily extensible. Thank you!