rldotai · September 12, 2022 19:24 · Sep 12, 2022
diff --git a/get_comments.py b/get_comments.py
@@ -0,0 +1,51 @@
+import io, tokenize
+
+
+def extract_comments(code: str | io.TextIOBase) -> str:
+    r"""Return only the ``#`` comments found in a piece of Python source.
+
+    Each comment keeps the line break that ended its line, so the result has
+    one comment per line; docstrings are strings, not comments, and are skipped.
+
+    Adapted for Python 3 from https://stackoverflow.com/a/34512388.
+
+    Args:
+        code: Source text, or a text stream whose ``readline`` supplies it.
+
+    Returns:
+        The comment tokens and the newline tokens directly after them,
+        concatenated in source order; ``""`` when there are no comments.
+
+    Example (raw docstring, so the ``\n`` below reach doctest unexpanded):
+        >>> extract_comments(r'''
+        ... # A comment
+        ... def identity(x):
+        ...     "This is a docstring, not a comment."
+        ...     # Here's a comment inside a function
+        ...     return x # and an inline comment
+        ...
+        ... ''')
+        "# A comment\n# Here's a comment inside a function\n# and an inline comment\n"
+    """
+    stream = io.StringIO(code) if isinstance(code, str) else code
+    line_breaks = (tokenize.NEWLINE, tokenize.NL)
+    pieces = []
+    previous = None
+    for toktype, tokval, *_ in tokenize.generate_tokens(stream.readline):
+        after_comment = previous == tokenize.COMMENT
+        # A line break is kept only when it directly follows a comment.
+        if toktype == tokenize.COMMENT or (after_comment and toktype in line_breaks):
+            pieces.append(tokval)
+        previous = toktype
+    return "".join(pieces)
+
+
+if __name__ == "__main__":  # CLI: print the comments of the file named on argv
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "input", type=argparse.FileType("r"), help="Input to extract comments from"
+    )
+    with parser.parse_args().input as source:  # close what FileType opened
+        print(extract_comments(source))