Last active
June 6, 2021 20:04
-
-
Save amerberg/a273ca1e579ab573b499 to your computer and use it in GitHub Desktop.
A script to remove comments from LaTeX source
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ply.lex, argparse, io | |
#Usage | |
# python stripcomments.py input.tex > output.tex | |
# python stripcomments.py input.tex -e encoding > output.tex | |
#This utility is released under the WTFPL license: http://www.wtfpl.net/about/ | |
def strip_comments(source):
    r"""Return the LaTeX text ``source`` with its comments removed.

    The text is tokenized with a PLY lexer and the values of all emitted
    tokens are concatenated. The lexer has four states:

    * ``INITIAL``: ordinary text. Every character is emitted unchanged,
      including ``\\`` and ``\%``.
    * ``linecomment``: entered at an unescaped ``%`` (and after
      ``\end{comment}``). Everything up to and including the next newline
      is dropped.
    * ``commentenv``: entered at ``\begin{comment}``. The opening tag, the
      contents and the closing tag are all dropped.
    * ``verbatim``: entered at ``\begin{verbatim}``. The tags and the
      contents are emitted unchanged, so ``%`` is kept there.

    Args:
        source: The LaTeX source as a text string.

    Returns:
        The source text with comments removed.
    """
    # NOTE: ply.lex.lex() looks up ``tokens``, ``states`` and the ``t_*``
    # rule functions by name in this scope and uses each rule's docstring as
    # its regular expression. Those names and docstrings must stay exactly as
    # they are; explanatory text goes in comments, never in rule docstrings.
    tokens = (
        'PERCENT', 'BEGINCOMMENT', 'ENDCOMMENT', 'BACKSLASH',
        'CHAR', 'BEGINVERBATIM', 'ENDVERBATIM', 'NEWLINE', 'ESCPCT',
    )
    states = (
        ('linecomment', 'exclusive'),
        ('commentenv', 'exclusive'),
        ('verbatim', 'exclusive'),
    )

    # Deal with escaped backslashes, so we don't think they're escaping %.
    # The rule is active in every state so that "\\" is always consumed as a
    # unit, but the token is only emitted outside of comments; previously an
    # escaped backslash inside a comment leaked into the output.
    def t_ANY_BACKSLASH(t):
        r"\\\\"
        if t.lexer.current_state() in ('linecomment', 'commentenv'):
            return None
        return t

    # One-line comments: the % itself is dropped (nothing is returned).
    def t_PERCENT(t):
        r"\%"
        t.lexer.begin("linecomment")

    # Escaped percent signs are ordinary text.
    def t_ESCPCT(t):
        r"\\\%"
        return t

    # Comment environment, as defined by the verbatim package.
    def t_BEGINCOMMENT(t):
        r"\\begin\s*{\s*comment\s*}"
        t.lexer.begin("commentenv")

    # Verbatim environment (different treatment of comments within).
    def t_BEGINVERBATIM(t):
        r"\\begin\s*{\s*verbatim\s*}"
        t.lexer.begin("verbatim")
        return t

    # Any other character in the initial state is left alone.
    def t_CHAR(t):
        r"."
        return t

    def t_NEWLINE(t):
        r"\n"
        return t

    # End of the comment environment.
    def t_commentenv_ENDCOMMENT(t):
        r"\\end\s*{\s*comment\s*}"
        # Anything after \end{comment} on a line is ignored!
        t.lexer.begin('linecomment')

    # Ignore the contents of the comment environment.
    def t_commentenv_CHAR(t):
        r"."
        pass

    def t_commentenv_NEWLINE(t):
        r"\n"
        pass

    # End of the verbatim environment.
    def t_verbatim_ENDVERBATIM(t):
        r"\\end\s*{\s*verbatim\s*}"
        t.lexer.begin('INITIAL')
        return t

    # Leave the contents of the verbatim environment alone.
    def t_verbatim_CHAR(t):
        r"."
        return t

    def t_verbatim_NEWLINE(t):
        r"\n"
        return t

    # End a % comment when we get to a new line.
    def t_linecomment_ENDCOMMENT(t):
        r"\n"
        # The newline at the end of a line comment is stripped as well.
        t.lexer.begin("INITIAL")

    # Ignore anything after a % on a line.
    def t_linecomment_CHAR(t):
        r"."
        pass

    lexer = ply.lex.lex()
    lexer.input(source)
    return u"".join(tok.value for tok in lexer)
def main():
    """Strip comments from the file named on the command line.

    Reads the file given by the positional ``filename`` argument using the
    encoding given by ``--encoding``/``-e`` (default ``utf-8``), passes its
    contents to ``strip_comments`` and writes the result to standard output.
    """
    # Function-scope import keeps this block self-contained.
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help='the file to strip comments from')
    parser.add_argument('--encoding', '-e', default='utf-8',
                        help='text encoding of the input file '
                             '(default: utf-8)')
    args = parser.parse_args()

    with io.open(args.filename, encoding=args.encoding) as f:
        source = f.read()

    # Write the result as-is: print() would append a newline that was not
    # present in the input.
    sys.stdout.write(strip_comments(source))


if __name__ == '__main__':
    main()
For anyone stumbling over this in the future: latexpand
can reliably remove comments, too.
Wow, I completely forgot about this and didn't see all these comments. Thanks to everyone who has made improvements. I've added a comment to clarify the licensing situation.
To remove all the comments from a LaTeX file, another option is to use arxiv-latex-cleaner. Actively maintained, 1.2k GitHub stars, written in Python but no need to know Python.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The
%
at the end of the 2nd line needs to be preserved or the code can't run; how can I solve that? UPDATE:
I've modified the snippet to solve my problem, along with @m3phisto's suggestion.
https://gist.github.com/dzhuang/dc34cdd7efa43e5ecc1dc981cc906c85
Thank you for the useful code.