Skip to content

Instantly share code, notes, and snippets.

@BroHui
Created September 13, 2017 03:31
Show Gist options
  • Save BroHui/aca2b8e6e6bdf3cb4af4b246c9837fa3 to your computer and use it in GitHub Desktop.
Save BroHui/aca2b8e6e6bdf3cb4af4b246c9837fa3 to your computer and use it in GitHub Desktop.
Remove comments and docstrings from a Python file.
""" Strip comments and docstrings from a file.
"""
import sys, token, tokenize
def do_file(fname):
    """Write a copy of *fname* with comments and docstrings stripped.

    The result is saved next to the input as ``fname + ",strip"``.  Every
    comment token is replaced by ``##`` followed by a newline, and every
    string token that directly follows an INDENT token (or is the very first
    token of the file) is treated as a docstring and replaced by ``#--``.
    Horizontal spacing between tokens on a line is reproduced from the token
    column positions.

    Note that a block whose body is nothing but a docstring is left with only
    a comment, so the output is not guaranteed to be valid Python.

    Args:
        fname: Path of the Python source file to process.
    """
    # tokenize.open() decodes the file using its BOM / PEP 263 coding cookie
    # instead of the locale default, so non-UTF-8 sources are read correctly.
    with tokenize.open(fname) as source:
        # The coding cookie is itself a comment and gets stripped, so the
        # output must use Python's default source encoding (UTF-8).
        with open(fname + ",strip", "w", encoding="utf-8") as mod:
            # Pretend the file starts right after an INDENT so that a leading
            # module docstring is recognised.
            prev_toktype = token.INDENT
            last_lineno = -1
            last_col = 0
            tokgen = tokenize.generate_tokens(source.readline)
            for toktype, ttext, (slineno, scol), (elineno, ecol), _ in tokgen:
                # A token on a new line starts counting columns from zero.
                if slineno > last_lineno:
                    last_col = 0
                # Re-create the gap between the previous token and this one.
                if scol > last_col:
                    mod.write(" " * (scol - last_col))
                if toktype == token.STRING and prev_toktype == token.INDENT:
                    # Docstring
                    mod.write("#--")
                elif toktype == tokenize.COMMENT:
                    # Comment
                    mod.write("##\n")
                else:
                    mod.write(ttext)
                prev_toktype = toktype
                last_col = ecol
                last_lineno = elineno
if __name__ == '__main__':
    # Script entry point: strip the file named by the single command-line
    # argument.  Exit with a usage message instead of an IndexError traceback
    # when the argument is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: %s FILE" % sys.argv[0])
    do_file(sys.argv[1])
@kb3dow
Copy link

kb3dow commented Nov 2, 2020

There is a bug with the code (having to do with the logic prev_toktype == token.INDENT)
If there is a docstring (1 line or multiline) that begins at column 1 (with no preceding spaces/tabs), it is not stripped out.

So an input of the form


""" string 1 """
    """ string 2 """

In this case string 1 is not stripped out

@kb3dow
Copy link

kb3dow commented Nov 2, 2020

changing

    if toktype == token.STRING and prev_toktype == token.INDENT:

to

    if toktype == token.STRING and (prev_toktype == token.INDENT or prev_toktype == token.NEWLINE):

does the job.

@newdive
Copy link

newdive commented Dec 11, 2020

this will not generate legal code if a method has nothing but a doc string
you can check python/lib/codecs.py for example

@thread13
Copy link

credits: Ned Batchelder
https://stackoverflow.com/a/1769577/558008

check also the comments to his answer

@binary-husky
Copy link

This code will modify line breaks, spaces, and backslashes (\), so I improved it further:

import token
import tokenize
import copy
import io

def remove_python_comments(input_source: str) -> str:
    """Return *input_source* with comments and docstrings blanked out.

    Every character of a comment token, and of a string token that directly
    follows an INDENT token (or is the first token of the source), is replaced
    by a space.  Newlines inside the blanked span are kept, so the result has
    the same length and the same line/column layout as the input; all other
    text (line breaks, spacing, backslash continuations) is left untouched.

    Only strings directly preceded by an INDENT token are treated as
    docstrings; a string preceded by a NEWLINE or a comment line is kept.

    Args:
        input_source: Python source code as a single string.

    Returns:
        The source with comment and docstring characters replaced by spaces.

    Exceptions raised by ``tokenize`` for source it cannot tokenize are not
    caught here.
    """
    # line_starts[k] is the absolute index of the first character of line k+1,
    # computed once so (lineno, col) -> index is O(1) per lookup.
    line_starts = [0]
    for line in input_source.split('\n'):
        line_starts.append(line_starts[-1] + len(line) + 1)

    # A mutable character buffer avoids rebuilding the whole string for every
    # single replaced character.
    chars = list(input_source)

    def blank_out(start, end):
        # Overwrite everything in [start, end) with spaces, keeping newlines
        # so multi-line docstrings do not change the line count.
        begin = line_starts[start[0] - 1] + start[1]
        stop = line_starts[end[0] - 1] + end[1]
        for index in range(begin, stop):
            if chars[index] != '\n':
                chars[index] = ' '

    # Start as if an INDENT was just seen so a leading module docstring is
    # recognised.
    prev_toktype = token.INDENT
    readline = io.StringIO(input_source).readline
    for toktype, _text, start, end, _line in tokenize.generate_tokens(readline):
        is_docstring = toktype == token.STRING and prev_toktype == token.INDENT
        if is_docstring or toktype == tokenize.COMMENT:
            blank_out(start, end)
        prev_toktype = toktype
    return ''.join(chars)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment