-
-
Save kylebarron/e46ce79911976c3aae9d7e98309b9222 to your computer and use it in GitHub Desktop.
Delete all comments from a Stata do file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# TODO: Comment code; include regexp? explanations | |
# TODO: parse with delimit in separate file; note this is imperfect. It | |
# has to be because of the way it works, which is super messy (especially | |
# for multi-line strings; i.e. stuff in quotes spanning many lines). | |
# TODO: scan code for `/*/`, `*/*`, and similar constructs. 'Please open | |
# and close all comment blocks explicitly' | |
# TODO: get rid of special case for locals? | |
""" | |
Delete all comments from Stata file | |
WARNINGS | |
-------- | |
Does not parse `#delimit ;` | |
`/*/*` and similar constructs are not parsed correctly. Note that as of | |
Stata 14, the parsing behavior of /*/ changed. `*/*` still ends a block | |
and starts a new one, however. | |
Usage | |
----- | |
From CLI: | |
$ python stataparse/comments.py /path/to/file.do | |
From Python: | |
>>> from StataComments import deleteStataComments | |
>>> doCode = deleteStataComments(open('/path/to/file.do', 'r').read()) | |
""" | |
from os import linesep | |
import regex | |
# Compiled patterns for each kind of Stata comment, keyed by the pass in
# deleteStataComments that consumes them.  All of them are compiled with
# VERBOSE and MULTILINE (so `^` and `$` anchor at line boundaries; `.` does
# not cross a newline because DOTALL is not set).  Every `{0}` placeholder is
# filled with os.linesep before compilation.
StataComment = {
    # First pass: `/* ... */` block comments, substituted with ''.
    'multiNested': regex.compile(
        (
            # Negative lookbehind: do not start a match at a `/*` that is
            # preceded on its line by a `//` comment or by a double quote.
            # NOTE(review): the quote look-arounds appear intended to keep
            # `//` inside a quoted string from counting as a comment.
            r'(?<!^\s*//.*|(?!\B"[^"]*)\s+//+(?![^"{0}]*"\B).*?|".*?)'
            # `/*`, then any run of characters that do not open or close a
            # block, or a recursive match of the whole pattern (`(?R)`, for
            # nested blocks), closed by `*/` or by the end of the input.
            r'/\*(?:(?!/\*|\*/)[\s\S]|(?R))*(\*/|\Z)'
        ).format(linesep),
        flags=regex.VERBOSE | regex.MULTILINE,
    ),
    # Second pass, used with deleteCStyle as the substitution callback:
    # text captured as `ignore` is put back, text captured as `delete`
    # is dropped.
    'multiNestedEscape': regex.compile(
        # ignore: a `//` comment up to the end of its line, or text that
        # starts with a double-quoted string.
        r'(?<ignore>\s*//.*|(?!\B"[^"]*)\s+//(?![^"]*"\B).*?|".*?".*?)'
        r'|'
        # delete: a (possibly nested, possibly unterminated) block comment.
        r'(?<delete>/\*(?:(?!/\*|\*/)[\s\S]|(?R))*(\*/|\Z))',
        flags=regex.VERBOSE | regex.MULTILINE,
    ),
    # Third pass: `//` and `///` comments, tried in this order.  Each match
    # is replaced by its `space` group, i.e. the whitespace captured in
    # front of the slashes is kept.
    'inline': [
        # Line whose first non-blank text is `//` followed by a character
        # other than `/`; matches through the end of that line.
        regex.compile(
            r'^(?<space>\s*)//[^/].*$',
            flags=regex.VERBOSE | regex.MULTILINE,
        ),
        # Line whose first non-blank text is `///`; matches lazily up to the
        # start of the next line (or the end of the input).
        regex.compile(
            r'^(?<space>\s*)///[\s\S]*?(^|\Z)',
            flags=regex.VERBOSE | regex.MULTILINE,
        ),
        # Whitespace followed by `//` (then end of line or a non-`/`
        # character), through the end of the line.
        # NOTE(review): the look-aheads appear intended to skip `//` that
        # sits inside a double-quoted string - confirm.
        regex.compile(
            r'(?!\B"[^"]*)(?<space>\s+)//($|[^/])(?![^"{0}]*"\B).*?$'.format(linesep),
            flags=regex.VERBOSE | regex.MULTILINE,
        ),
        # Whitespace followed by `///`, lazily up to the start of the next
        # line (or the end of the input), with the same quote look-aheads.
        regex.compile(
            r'(?!\B"[^"]*)(?<space>\s+)///(?![^"{0}]*"\B)[\s\S]*?(^|\Z)'.format(linesep),
            flags=regex.VERBOSE | regex.MULTILINE,
        ),
    ],
    # Used by deleteLineStar: a line whose first non-blank character is `*`,
    # matched lazily up to the start of the next line (or the end of input).
    'linestar': regex.compile(
        r'^\s*\*[\s\S]*?(^|\Z)',
        flags=regex.VERBOSE | regex.MULTILINE,
    ),
}
# Splits the input into consecutive (stata, mata) pairs; deleteLineStar is
# the substitution callback applied to each pair.  DOTALL lets `.` span
# newlines here, MULTILINE makes `^` / `$` anchor at line boundaries.
StataMata = regex.compile(
    # stata: the shortest run of text before the next mata block (or before
    # the end of the input).
    r"(?<stata>.*?)"
    r"(?<mata>"
    # Opening line of a mata block: optional leading whitespace, then ...
    r"(^\s*"
    # ... any number of prefixes - abbreviations of `capture` (cap..capture),
    # `noisily` (n..noisily) or `quietly` (qui..quietly) - each followed by
    # whitespace and an optional colon ...
    r"(\s*(cap(t(u(re?)?)?)?|n(o(i(s(i(ly?)?)?)?)?)?|qui(e(t(ly?)?)?)?)(\s+:?|:?\s+))*"
    # ... then `mata`, optionally followed by `:`, alone up to end of line.
    r"mata\s*:?\s*$"
    r")"
    # Body of the block: lazily up to the rest of a line containing `end`,
    # or up to the end of the input.  The trailing `|\Z` alternative lets
    # the mata group match empty at the end of the input, so the final
    # stata chunk is still captured.
    r".*?(\s*end.*?$|\Z)|\Z"
    r")",
    flags=regex.VERBOSE | regex.DOTALL | regex.MULTILINE,
)
def deleteStataComments(doStr):
    """Return the text of a Stata do file with its comments removed.

    The passes run in a fixed order, each on the output of the previous one:

    1. Every match of ``StataComment['multiNested']`` (block comments) is
       replaced by the empty string.
    2. ``StataComment['multiNestedEscape']`` is applied with ``deleteCStyle``
       as callback, which keeps the ``ignore`` matches and drops the
       ``delete`` matches.
    3. Each pattern in ``StataComment['inline']`` is replaced by its
       captured ``space`` group, so the whitespace in front of the comment
       is kept.
    4. ``StataMata`` is applied with ``deleteLineStar`` as callback, which
       strips ``StataComment['linestar']`` matches from the ``stata`` part
       of each match and leaves the ``mata`` part as is.

    Parameters
    ----------
    doStr : str
        Contents of the do file.

    Returns
    -------
    str
        ``doStr`` after all four passes.
    """
    doStr = StataComment['multiNested'].sub('', doStr)
    doStr = StataComment['multiNestedEscape'].sub(deleteCStyle, doStr)

    for regexp in StataComment['inline']:
        # Raw string: `\g` is not a valid Python escape sequence, so the
        # non-raw spelling triggers an invalid-escape warning even though
        # the resulting template text is the same.
        doStr = regexp.sub(r'\g<space>', doStr)

    return StataMata.sub(deleteLineStar, doStr)
def deleteCStyle(match):
    """Substitution callback that keeps ``ignore`` text and drops the rest.

    Passed to ``StataComment['multiNestedEscape'].sub`` by
    ``deleteStataComments``.

    Parameters
    ----------
    match
        Match object exposing a named group ``ignore`` (and, in the pattern
        it is used with, a named group ``delete``).

    Returns
    -------
    str
        The text captured by ``ignore`` when that group matched something;
        otherwise the empty string, which deletes the matched text.
    """
    # A group that did not take part in the match is reported as None.
    # Falling back to '' guarantees the callback always returns a string
    # instead of running off the end of the function and returning None.
    return match.group('ignore') or ''
def deleteLineStar(match):
    """Substitution callback that strips ``*`` comment lines from Stata code.

    Passed to ``StataMata.sub`` by ``deleteStataComments``.  Only the
    ``stata`` part of the match is filtered through
    ``StataComment['linestar']``; the ``mata`` part is copied unchanged.

    Parameters
    ----------
    match
        Match object exposing the named groups ``stata`` and ``mata``.

    Returns
    -------
    str
        The filtered ``stata`` text followed by the untouched ``mata`` text.
        A group that is empty or did not match contributes nothing.
    """
    # Look the groups up by name rather than unpacking groupdict().values(),
    # which silently depends on dict ordering and on the pattern having
    # exactly these two named groups.
    stata = match.group('stata')
    mata = match.group('mata')

    pieces = []
    if stata:
        pieces.append(StataComment['linestar'].sub('', stata))
    if mata:
        pieces.append(mata)
    return ''.join(pieces)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment