Last active
August 7, 2018 16:56
-
-
Save mcaceresb/374f5a3bea8d47948dc2829f8c7a0f4f to your computer and use it in GitHub Desktop.
Delete all comments from a Stata do file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Delete all comments from a Stata do-file

WARNINGS
--------

Does not parse `#delimit ;`

`/*/*` and similar constructs are not parsed correctly. Note that as of
Stata 14, the parsing behavior of `/*/` changed. `*/*` still ends a block
and starts a new one, however.

Usage
-----

From CLI:

    $ python deleteStataComments.py /path/to/file.do

From Python:

    >>> from deleteStataComments import deleteComments
    >>> doCode = deleteComments(open('/path/to/file.do', 'r').read())
"""
from os import linesep | |
import regex | |
import sys | |
# Flags shared by every comment pattern below.
_COMMENT_FLAGS = regex.VERBOSE | regex.MULTILINE

# Compiled patterns used by deleteComments and deleteLineStar, keyed by the
# kind of comment they target.  The syntax ((?R) recursion, variable-length
# lookbehind, (?<name>...) groups) requires the third-party `regex` module.
StataComment = {
    # A `/* ... */` block, closed by `*/` or by the end of the input; nested
    # blocks are consumed through the recursive (?R).  The leading lookbehind
    # rejects an opener that follows `//` comment text or a double quote.
    # NOTE(review): the quote-related lookarounds look intended to protect
    # string literals -- confirm against real do-files.
    'multiNested': regex.compile(
        (
            r'(?<!^\s*//.*|(?!\B"[^"]*)\s+//+(?![^"{0}]*"\B).*?|".*?)'
            r'/\*(?:(?!/\*|\*/)[\s\S]|(?R))*(\*/|\Z)'
        ).format(linesep),
        flags=_COMMENT_FLAGS,
    ),
    # Two alternatives: `ignore` (`//` comment text or a double-quoted
    # string) and `delete` (a `/* ... */` block).  deleteCStyle decides what
    # to emit for each match.
    'multiNestedEscape': regex.compile(
        r'(?<ignore>\s*//.*|(?!\B"[^"]*)\s+//(?![^"]*"\B).*?|".*?".*?)'
        r'|'
        r'(?<delete>/\*(?:(?!/\*|\*/)[\s\S]|(?R))*(\*/|\Z))',
        flags=_COMMENT_FLAGS,
    ),
    # Applied in this order; each captures its leading whitespace as `space`
    # so the substitution can put it back:
    #   1. `//` opening a line (not followed by a third slash)
    #   2. `///` opening a line, through the start of the next line
    #   3. `//` preceded by whitespace later on a line
    #   4. `///` preceded by whitespace, through the start of the next line
    'inline': [
        regex.compile(pattern, flags=_COMMENT_FLAGS)
        for pattern in (
            r'^(?<space>\s*)//[^/].*$',
            r'^(?<space>\s*)///[\s\S]*?^',
            r'(?!\B"[^"]*)(?<space>\s+)//($|[^/])(?![^"{0}]*"\B).*?$'.format(linesep),
            r'(?!\B"[^"]*)(?<space>\s+)///(?![^"{0}]*"\B)[\s\S]*?^'.format(linesep),
        )
    ],
    # A line whose first non-blank character is `*`, up to the start of the
    # next line or the end of the input.
    'linestar': regex.compile(
        r'^\s*\*[\s\S]*?(^|\Z)',
        flags=_COMMENT_FLAGS,
    ),
}
# Splits a do-file into alternating pieces: a lazily matched `stata` chunk
# followed by a `mata` chunk.  The `mata` chunk is either
#   * a line holding only optional `capture` / `noisily` / `quietly`
#     prefixes (any accepted abbreviation, with optional colons) and then
#     `mata` with an optional colon, plus everything up to the next `end`
#     (through the end of that line) or the end of the input; or
#   * the empty match at the end of the input.
_STATA_MATA_PATTERN = (
    r"(?<stata>.*?)"
    r"(?<mata>"
    r"(^\s*"
    r"(\s*(cap(t(u(re?)?)?)?|n(o(i(s(i(ly?)?)?)?)?)?|qui(e(t(ly?)?)?)?)(\s+:?|:?\s+))*"
    r"mata\s*:?\s*$"
    r")"
    r".*?(\s*end.*?$|\Z)|\Z"
    r")"
)

# DOTALL lets `.` run across lines so both chunks can span several lines.
StataMata = regex.compile(
    _STATA_MATA_PATTERN,
    flags=regex.VERBOSE | regex.DOTALL | regex.MULTILINE,
)
def main():
    """Command-line entry point.

    Takes the do-file path from the first command-line argument, strips its
    comments with deleteComments and prints the result.  Does nothing when
    no argument is given.
    """
    arguments = sys.argv[1:]
    if not arguments:
        return
    with open(arguments[0], 'r') as source:
        contents = source.read()
    print(deleteComments(contents))
def deleteComments(doStr):
    """Return the text of a Stata do-file with its comments removed.

    The passes run in a fixed order; each works on the previous result:

    1. Every ``StataComment['multiNested']`` match is replaced with ''.
    2. ``StataComment['multiNestedEscape']`` matches go through
       deleteCStyle, which keeps `ignore` text and drops `delete` text.
    3. Each ``StataComment['inline']`` pattern is replaced by the leading
       whitespace it captured in its `space` group.
    4. ``StataMata`` splits the text into stata/mata pieces and
       deleteLineStar removes `*` comment lines from the stata pieces only.

    Parameters
    ----------
    doStr : str
        Full contents of the do-file.

    Returns
    -------
    str
        The contents with comments removed.
    """
    doStr = StataComment['multiNested'].sub('', doStr)
    doStr = StataComment['multiNestedEscape'].sub(deleteCStyle, doStr)
    for regexp in StataComment['inline']:
        # Raw string: '\g' is not a valid Python escape sequence, so the
        # non-raw spelling triggers an invalid-escape warning on modern
        # interpreters.  The replacement template itself is unchanged.
        doStr = regexp.sub(r'\g<space>', doStr)
    return StataMata.sub(deleteLineStar, doStr)
def deleteCStyle(match):
    """Substitution callback for ``StataComment['multiNestedEscape']``.

    Parameters
    ----------
    match : match object
        Must expose the named groups `ignore` and `delete` via groupdict().

    Returns
    -------
    str
        The `ignore` text unchanged when that group matched; otherwise ''
        (a `delete` match is removed).
    """
    # `or ''` makes the callback always return a str; previously it fell
    # off the end and returned None when neither group held any text.
    return match.groupdict()['ignore'] or ''
def deleteLineStar(match):
    """Substitution callback for ``StataMata``.

    Removes ``StataComment['linestar']`` matches from the `stata` part of
    the match and appends the `mata` part untouched.

    Parameters
    ----------
    match : match object
        Must expose the named groups `stata` and `mata`.

    Returns
    -------
    str
        The cleaned `stata` text followed by the unchanged `mata` text;
        a group that is empty or did not match contributes nothing.
    """
    # Fetch the groups by name.  Unpacking groupdict().values() silently
    # depended on dict ordering and on the pattern having exactly two
    # named groups.
    stata = match.group('stata')
    mata = match.group('mata')
    rstr = ""
    if stata:
        rstr += StataComment['linestar'].sub('', stata)
    if mata:
        rstr += mata
    return rstr
# Run the command-line interface only when this file is executed as a
# script, not when it is imported for deleteComments.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment