Last active
December 6, 2019 16:13
-
-
Save Syncrossus/46acf43baefaeba6eef1f346a71e0b95 to your computer and use it in GitHub Desktop.
Determines the proportion of Python code that consists of comments and docstrings. To use, type `python comment_proportions.py file1.py [file2.py] [file3.py] [...]` in your terminal.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pygments import highlight | |
from pygments.lexers import get_lexer_by_name | |
from pygments.formatters import BBCodeFormatter | |
import re | |
import sys | |
def get_comments(code):
    """Extract comments and docstrings from Python source code.

    The code is tokenized with the pygments Python lexer and every token the
    lexer classifies as a comment or as a docstring is collected.  Reading
    the token stream directly (instead of pattern-matching colour-coded
    formatter output) keeps the result independent of the colours used by
    the installed pygments style and of markup-like text that may appear
    inside the analysed code.

    Args:
        - code (str) : the code to extract comments from
    Return:
        - comments (list<str>) : the text of every comment token, followed
          by the text of every docstring token, found in the code
    """
    # Local import: pygments is already a dependency of this file, and
    # importing here keeps the function usable without touching the
    # file-level import block.
    from pygments.token import Comment, String

    # stripall=True drops leading/trailing whitespace of the input before
    # lexing, exactly as the lexer was configured before.
    lexer = get_lexer_by_name("python", stripall=True)

    comments = []
    docstrings = []
    for token_type, value in lexer.get_tokens(code):
        # `in` on pygments token types also matches subtypes
        # (e.g. Comment.Single is "in" Comment).
        if token_type in Comment:
            comments.append(value)
        elif token_type in String.Doc:
            docstrings.append(value)

    # Comments first, then docstrings, matching the historical ordering.
    return comments + docstrings
def compute_comment_stats(file_list):
    """Compute comment statistics for each file and for all files combined.

    For every file the total length of its source text and the combined
    length of its comments and docstrings (as returned by ``get_comments``)
    are measured, together with the ratio of the two.  Lengths are measured
    with ``len`` on the decoded text, i.e. in characters; the "bytes" wording
    in the result keys is kept for backward compatibility.

    Args:
        - file_list (list<str>): the list of files to compute stats for
    Return:
        - stats (dict): a dict with filenames as keys and dicts of
            statistics as values ("comment bytes", "total bytes", "ratio").
            An extra "total" key holds the aggregate over all files.
            A ratio is reported as 0.0 when the corresponding total length
            is zero (empty file, or no files at all).
    """
    stats = {}
    total_len = 0
    total_comment_len = 0
    for file in file_list:
        with open(file, 'r') as f:
            source_code = f.read()
        source_len = len(source_code)
        comment_len = sum(
            len(comment) for comment in get_comments(source_code))
        total_len += source_len
        total_comment_len += comment_len
        stats[file] = {
            "comment bytes": comment_len,
            "total bytes": source_len,
            # An empty file would otherwise raise ZeroDivisionError.
            "ratio": comment_len / source_len if source_len else 0.0}
    stats["total"] = {
        "comment bytes": total_comment_len,
        "total bytes": total_len,
        # Same guard for an empty file list or only-empty files.
        "ratio": total_comment_len / total_len if total_len else 0.0}
    return stats
if __name__ == '__main__':
    # Script entry point: every command-line argument after the script
    # name is treated as a file to analyse; the resulting statistics dict
    # is printed as-is.
    file_args = sys.argv[1:]
    stats = compute_comment_stats(file_args)
    print(stats)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
If you liked this, check out cloc; it's a fantastic counter for lines of code.