mcaceresb · August 7, 2018 16:56
diff --git a/deleteStataComments.py b/deleteStataComments.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-

 """
 Delete all comments from Stata file

 WARNINGS
 --------

 Does not parse `#delimit ;`

 `/*/*` and similar constructs are not parsed correctly. Note that as of
 Stata 14, the parsing behavior of /*/ changed. `*/*` still ends a block
 and starts a new one, however.

 Usage
 -----

 From CLI:
 $ python deleteStataComments.py /path/to/file.do

 From Python:
 >>> from deleteStataComments import deleteComments
 >>> doCode = deleteComments(open('/path/to/file.do', 'r').read())
 """

 from os import linesep
 import regex
 import sys

 # Compiled patterns used by deleteComments, keyed by the kind of Stata
 # comment they target. They rely on features of the third-party `regex`
 # module: recursion via (?R), (?<name>...) named groups, and lookbehinds
 # containing variable-length alternatives.
 StataComment = {
    # Block comments: `/*`, then any run of characters that do not begin
    # another `/*` or `*/`, or a recursive match of the whole pattern via
    # (?R) (nested blocks), closed by `*/` or by end of input (\Z).
    # The leading negative lookbehind rejects a `/*` that comes after `//`
    # or after a double quote; `{0}` is filled with os.linesep by .format().
    # NOTE(review): the exact quote-handling semantics of the lookarounds
    # should be confirmed against real do-files.
    'multiNested': regex.compile(
        (
        r'(?<!^\s*//.*|(?!\B"[^"]*)\s+//+(?![^"{0}]*"\B).*?|".*?)'
        r'/\*(?:(?!/\*|\*/)[\s\S]|(?R))*(\*/|\Z)'
        ).format(linesep)
        ,
        flags = regex.VERBOSE + regex.MULTILINE
    ),
    # Two-way alternation consumed by deleteCStyle: the `ignore` group
    # matches `//` comments and double-quoted text, which are put back
    # unchanged; the `delete` group matches a (possibly nested) block
    # comment, which is dropped.
    'multiNestedEscape': regex.compile(
        r'(?<ignore>\s*//.*|(?!\B"[^"]*)\s+//(?![^"]*"\B).*?|".*?".*?)'
        r'|'
        r'(?<delete>/\*(?:(?!/\*|\*/)[\s\S]|(?R))*(\*/|\Z))'
        ,
        flags = regex.VERBOSE + regex.MULTILINE
    ),
    # Line comments. Each pattern captures the whitespace before the
    # comment in the `space` group; deleteComments substitutes that group
    # back so only the comment text is removed.
    'inline': [
        # `//` at the start of a line (after optional whitespace), followed
        # by a non-slash character, through the end of that line.
        regex.compile(
            r'^(?<space>\s*)//[^/].*$',
            flags = regex.VERBOSE + regex.MULTILINE
        ),
        # `///` at the start of a line: lazily consumes everything up to
        # the start of the next line, so the line break is removed too.
        regex.compile(
            r'^(?<space>\s*)///[\s\S]*?^',
            flags = regex.VERBOSE + regex.MULTILINE
        ),
        # `//` preceded by whitespace anywhere on a line, through the end
        # of the line; the lookarounds are meant to leave `//` inside
        # double quotes alone (NOTE(review): confirm).
        regex.compile(
            r'(?!\B"[^"]*)(?<space>\s+)//($|[^/])(?![^"{0}]*"\B).*?$'.format(linesep),
            flags = regex.VERBOSE + regex.MULTILINE
        ),
        # `///` preceded by whitespace anywhere on a line, through the
        # start of the next line.
        regex.compile(
            r'(?!\B"[^"]*)(?<space>\s+)///(?![^"{0}]*"\B)[\s\S]*?^'.format(linesep),
            flags = regex.VERBOSE + regex.MULTILINE
        )
    ],
    # A line whose first non-blank character is `*`: lazily consumes up to
    # the start of the next line, or to the end of input.
    'linestar': regex.compile(
        r'^\s*\*[\s\S]*?(^|\Z)',
        flags = regex.VERBOSE + regex.MULTILINE
    )
 }

 # Splits a do-file into alternating (`stata`, `mata`) chunks so that
 # deleteLineStar can strip `*` line comments from the Stata part only and
 # copy the Mata part through unchanged.
 StataMata = regex.compile(
    # Everything (lazily, across lines because of DOTALL) before the next
    # mata block or the end of input.
    r"(?<stata>.*?)"
    r"(?<mata>"
    # Opening line of a mata block: optional whitespace, then any number of
    # capture/noisily/quietly prefixes (abbreviations down to `cap`, `n`
    # and `qui` are accepted), each followed by whitespace and/or a colon...
    r"(^\s*"
    r"(\s*(cap(t(u(re?)?)?)?|n(o(i(s(i(ly?)?)?)?)?)?|qui(e(t(ly?)?)?)?)(\s+:?|:?\s+))*"
    # ...then `mata` with an optional colon as the last thing on the line.
    r"mata\s*:?\s*$"
    r")"
    # Block body up to a line containing `end` (or the end of input). The
    # final `|\Z` lets the `mata` group match empty at the end of input so
    # trailing Stata code is still processed.
    r".*?(\s*end.*?$|\Z)|\Z"
    r")"
    ,
    flags = regex.VERBOSE + regex.DOTALL + regex.MULTILINE
 )


 def main():
    """
    Command-line entry point

    Reads the do-file named by the first command-line argument, strips
    its comments with deleteComments, and prints the result. Does
    nothing when no argument is given.
    """
    if len(sys.argv) <= 1:
        return

    with open(sys.argv[1], 'r') as doHandle:
        doCode = doHandle.read()
        print(deleteComments(doCode))


 def deleteComments(doStr):
    """
    Remove all comments from a string of Stata code

    The passes run in a fixed order, each on the output of the previous:

    1. StataComment['multiNested'] matches are replaced with nothing.
    2. StataComment['multiNestedEscape'] matches are passed to
       deleteCStyle, which keeps `ignore` matches and drops `delete`
       matches.
    3. Every StataComment['inline'] pattern is replaced by its captured
       leading whitespace (the `space` group).
    4. StataMata matches are passed to deleteLineStar.

    Parameters
    ----------
    doStr : str
        Source text of a Stata do-file.

    Returns
    -------
    str
        The text after all four passes.
    """
    doStr = StataComment['multiNested'].sub(
        '',
        doStr
    )

    doStr = StataComment['multiNestedEscape'].sub(
        deleteCStyle,
        doStr
    )

    for regexp in StataComment['inline']:
        # Raw string: a plain '\g' is an invalid escape sequence in a
        # normal string literal and triggers a warning on recent Pythons.
        doStr = regexp.sub(
            r'\g<space>',
            doStr
        )

    doStr = StataMata.sub(deleteLineStar, doStr)

    return doStr


 def deleteCStyle(match):
    """
    Replacement callback for StataComment['multiNestedEscape']

    Parameters
    ----------
    match : match object
        Must expose `ignore` and `delete` in its groupdict().

    Returns
    -------
    str or None
        The `ignore` text unchanged when that group matched something,
        '' when the `delete` group matched instead, and None when both
        are empty.
    """
    groups = match.groupdict()
    kept = groups['ignore']
    if kept:
        return kept

    if groups['delete']:
        return ''


 def deleteLineStar(match):
    """
    Replacement callback for StataMata

    Removes StataComment['linestar'] matches from the `stata` part of the
    match and appends the `mata` part unchanged.

    Parameters
    ----------
    match : match object
        Must define the named groups `stata` and `mata`.

    Returns
    -------
    str
        The cleaned `stata` text followed by the `mata` text; a group
        that is empty or did not participate contributes nothing.
    """
    # Look the groups up by name: unpacking groupdict().values() depends
    # on dict ordering and breaks if another named group is ever added.
    stata = match.group('stata')
    mata = match.group('mata')

    pieces = []
    if stata:
        pieces.append(StataComment['linestar'].sub('', stata))

    if mata:
        pieces.append(mata)

    return "".join(pieces)


 # Run the command-line entry point only when this file is executed as a
 # script, not when it is imported as a module.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	"""
	Delete all comments from Stata file

	WARNINGS
	--------

	Does not parse `#delimit ;`

	`/*/*` and similar constructs are not parsed correctly. Note that as of
	Stata 14, the parsing behavior of /*/ changed. `*/*` still ends a block
	and starts a new one, however.

	Usage
	-----

	From CLI:
	$ python deleteStataComments.py /path/to/file.do

	From Python:
	>>> from deleteStataComments import deleteComments
	>>> doCode = deleteComments(open('/path/to/file.do', 'r').read())
	"""

	from os import linesep
	import regex
	import sys

	# Compiled patterns used by deleteComments, keyed by the kind of Stata
	# comment they target. They rely on features of the third-party `regex`
	# module: recursion via (?R), (?<name>...) named groups, and lookbehinds
	# containing variable-length alternatives.
	StataComment = {
	    # Block comments: `/*`, then any run of characters that do not begin
	    # another `/*` or `*/`, or a recursive match of the whole pattern via
	    # (?R) (nested blocks), closed by `*/` or by end of input (\Z).
	    # `{0}` is filled with os.linesep by .format().
	    'multiNested': regex.compile(
	        (
	        r'(?<!^\s*//.*|(?!\B"[^"]*)\s+//+(?![^"{0}]*"\B).*?|".*?)'
	        r'/\*(?:(?!/\*|\*/)[\s\S]|(?R))*(\*/|\Z)'
	        ).format(linesep)
	        ,
	        flags = regex.VERBOSE + regex.MULTILINE
	    ),
	    # Two-way alternation consumed by deleteCStyle: `ignore` matches `//`
	    # comments and double-quoted text (kept as-is); `delete` matches a
	    # (possibly nested) block comment (dropped).
	    'multiNestedEscape': regex.compile(
	        r'(?<ignore>\s*//.*|(?!\B"[^"]*)\s+//(?![^"]*"\B).*?|".*?".*?)'
	        r'|'
	        r'(?<delete>/\*(?:(?!/\*|\*/)[\s\S]|(?R))*(\*/|\Z))'
	        ,
	        flags = regex.VERBOSE + regex.MULTILINE
	    ),
	    # Line comments. Each pattern captures the whitespace before the
	    # comment in the `space` group, which deleteComments substitutes back.
	    'inline': [
	        # `//` at the start of a line, through the end of that line.
	        regex.compile(
	            r'^(?<space>\s*)//[^/].*$',
	            flags = regex.VERBOSE + regex.MULTILINE
	        ),
	        # `///` at the start of a line, through the start of the next line.
	        regex.compile(
	            r'^(?<space>\s*)///[\s\S]*?^',
	            flags = regex.VERBOSE + regex.MULTILINE
	        ),
	        # `//` preceded by whitespace mid-line, through the end of the line.
	        regex.compile(
	            r'(?!\B"[^"]*)(?<space>\s+)//($|[^/])(?![^"{0}]*"\B).*?$'.format(linesep),
	            flags = regex.VERBOSE + regex.MULTILINE
	        ),
	        # `///` preceded by whitespace mid-line, through the start of the
	        # next line.
	        regex.compile(
	            r'(?!\B"[^"]*)(?<space>\s+)///(?![^"{0}]*"\B)[\s\S]*?^'.format(linesep),
	            flags = regex.VERBOSE + regex.MULTILINE
	        )
	    ],
	    # A line whose first non-blank character is `*`, up to the start of
	    # the next line or the end of input.
	    'linestar': regex.compile(
	        r'^\s*\*[\s\S]*?(^|\Z)',
	        flags = regex.VERBOSE + regex.MULTILINE
	    )
	}

	# Splits a do-file into alternating (`stata`, `mata`) chunks so that
	# deleteLineStar can strip `*` line comments from the Stata part only and
	# copy the Mata part through unchanged.
	StataMata = regex.compile(
	    # Everything (lazily, across lines because of DOTALL) before the next
	    # mata block or the end of input.
	    r"(?<stata>.*?)"
	    r"(?<mata>"
	    # Opening line: optional capture/noisily/quietly prefixes (with their
	    # abbreviations), then `mata` and an optional colon ending the line.
	    r"(^\s*"
	    r"(\s*(cap(t(u(re?)?)?)?|n(o(i(s(i(ly?)?)?)?)?)?|qui(e(t(ly?)?)?)?)(\s+:?|:?\s+))*"
	    r"mata\s*:?\s*$"
	    r")"
	    # Block body up to a line containing `end`, or the end of input; the
	    # final `|\Z` lets `mata` match empty at the end of input.
	    r".*?(\s*end.*?$|\Z)|\Z"
	    r")"
	    ,
	    flags = regex.VERBOSE + regex.DOTALL + regex.MULTILINE
	)


	def main():
	    """
	    Command-line entry point

	    Reads the do-file named by the first command-line argument, strips
	    its comments with deleteComments, and prints the result. Does
	    nothing when no argument is given.
	    """
	    if len(sys.argv) > 1:
	        doFile = sys.argv[1]
	        with open(doFile, 'r') as doHandle:
	            print(deleteComments(doHandle.read()))


	def deleteComments(doStr):
	    """
	    Remove all comments from a string of Stata code

	    Runs, in order: the StataComment['multiNested'] pass (matches
	    replaced with nothing), the StataComment['multiNestedEscape'] pass
	    (via deleteCStyle), each StataComment['inline'] pattern (replaced by
	    its captured `space` group), and the StataMata pass (via
	    deleteLineStar).

	    Parameters
	    ----------
	    doStr : str
	        Source text of a Stata do-file.

	    Returns
	    -------
	    str
	        The text after all passes.
	    """
	    doStr = StataComment['multiNested'].sub(
	        '',
	        doStr
	    )

	    doStr = StataComment['multiNestedEscape'].sub(
	        deleteCStyle,
	        doStr
	    )

	    for regexp in StataComment['inline']:
	        # Raw string: a plain '\g' is an invalid escape sequence in a
	        # normal string literal and triggers a warning on recent Pythons.
	        doStr = regexp.sub(
	            r'\g<space>',
	            doStr
	        )

	    doStr = StataMata.sub(deleteLineStar, doStr)

	    return doStr


	def deleteCStyle(match):
	    """
	    Replacement callback for StataComment['multiNestedEscape']

	    Parameters
	    ----------
	    match : match object
	        Must expose `ignore` and `delete` in its groupdict().

	    Returns
	    -------
	    str or None
	        The `ignore` text unchanged when that group matched something,
	        '' when the `delete` group matched instead, and None when both
	        are empty.
	    """
	    if match.groupdict()['ignore']:
	        return match.groupdict()['ignore']
	    elif match.groupdict()['delete']:
	        return ''


	def deleteLineStar(match):
	    """
	    Replacement callback for StataMata

	    Removes StataComment['linestar'] matches from the `stata` part of the
	    match and appends the `mata` part unchanged.

	    Parameters
	    ----------
	    match : match object
	        Must define the named groups `stata` and `mata`.

	    Returns
	    -------
	    str
	        The cleaned `stata` text followed by the `mata` text; a group
	        that is empty or did not participate contributes nothing.
	    """
	    # Look the groups up by name: unpacking groupdict().values() depends
	    # on dict ordering and breaks if another named group is ever added.
	    stata = match.group('stata')
	    mata = match.group('mata')

	    rstr = ""
	    if stata:
	        rstr += StataComment['linestar'].sub('', stata)

	    if mata:
	        rstr += mata

	    return rstr


	# Run the command-line entry point only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == "__main__":
	    main()