haldean · October 21, 2015 21:45
diff --git a/authordensity.py b/authordensity.py
 """
 Most useful as:

    git ls-tree --name-only -r HEAD | xargs python /path/to/authordensity.py
    
 When run from the root of your git repository. If people show up under
 multiple names, use the synonyms dict to map their aliases to a canonical
 name.

 Needs no external libs.
 """

 from __future__ import division, print_function

 import collections
 import re
 import subprocess
 import sys

 # Maps an alias reported by git blame to the canonical author name it should
 # be counted under; leave empty when nobody shows up under several names.
 synonyms = {
 }

 # Only paths ending in one of these source extensions are blamed.
 match_files = re.compile(r"\.(cc|h|cpp|hpp|c|py|pxi|pyx)$")
 # Selects the "author <name>" header lines of `git blame --line-porcelain`.
 author_line_re = re.compile("^author ")
 # Maximum number of files listed under each author.
 count = 30

 # file name -> {author: fraction of that file's blamed lines}
 file_densities = dict()
 # Every canonical author name seen in any blamed file.
 authors_found = set()
 # author -> blamed line count summed over all inspected files
 total_freq = collections.defaultdict(int)

 for f in sys.argv[1:]:
     if not match_files.search(f):
         continue
     try:
         blame = subprocess.check_output(
             ["git", "blame", "--line-porcelain", f], stderr=subprocess.PIPE)
     except subprocess.CalledProcessError as e:
         # Exit status 128 is treated as "this path cannot be blamed" and the
         # file is skipped; any other failure is propagated.
         if e.returncode == 128:
             continue
         raise
     # check_output returns bytes on Python 3; decode so the str regex and
     # the printed author names work there too. On Python 2 the output is
     # already str and is left untouched.
     if not isinstance(blame, str):
         blame = blame.decode("utf-8", "replace")
     # Split on "\n" only: str.splitlines() would also break on other
     # separator characters that may occur inside blamed content lines.
     # A list (not a lazy filter object) is required because it is both
     # iterated and measured with len() below.
     author_lines = [l for l in blame.split("\n") if author_line_re.match(l)]
     authors_freq = collections.defaultdict(int)
     for line in author_lines:
         author = line.split(" ", 1)[1]
         author = synonyms.get(author, author)
         authors_found.add(author)
         authors_freq[author] += 1
         total_freq[author] += 1
     # One "author" header is counted per blamed line, so this is the number
     # of lines attributed in the file. It is only used as a divisor when
     # authors_freq is non-empty, hence never zero there.
     total_lines = len(author_lines)
     file_densities[f] = {name: owned / total_lines
                          for name, owned in authors_freq.items()}

 # Per-author report: the files where the author owns the largest share.
 # Authors are visited in sorted order so the output is reproducible.
 for author in sorted(authors_found):
     author_densities = sorted(
         ((density[author], fname)
          for fname, density in file_densities.items()
          if density.get(author)),
         reverse=True)
     print("\n%s" % author)
     print("\n".join("%3.0f%%\t%s" % (100. * share, fname)
                     for share, fname in author_densities[:count]))
     print()

 # Overall table: line count, share of all blamed lines, author name, with
 # the largest contributors first (ties broken by name, descending).
 total_lines = sum(total_freq.values())
 records = sorted(total_freq.items(),
                  key=lambda p: (p[1], p[0]), reverse=True)
 for author, freq in records:
     print("%s\t%.3f%%\t\t%s" % (freq, 100. * freq / total_lines, author))
	"""
	Most useful as:

	    git ls-tree --name-only -r HEAD | xargs python /path/to/authordensity.py

	When run from the root of your git repository. If people show up under
	multiple names, use the synonyms dict to map their aliases to a canonical
	name.

	Needs no external libs.
	"""

	from __future__ import division, print_function

	import collections
	import re
	import subprocess
	import sys

	# Maps an alias reported by git blame to the canonical author name it should
	# be counted under; leave empty when nobody shows up under several names.
	synonyms = {
	}

	# Only paths ending in one of these source extensions are blamed. The
	# alternation uses plain "|"; an escaped "\|" would match a literal pipe
	# character and no source file would ever be selected.
	match_files = re.compile(r"\.(cc|h|cpp|hpp|c|py|pxi|pyx)$")
	# Selects the "author <name>" header lines of `git blame --line-porcelain`.
	author_line_re = re.compile("^author ")
	# Maximum number of files listed under each author.
	count = 30

	# file name -> {author: fraction of that file's blamed lines}
	file_densities = dict()
	# Every canonical author name seen in any blamed file.
	authors_found = set()
	# author -> blamed line count summed over all inspected files
	total_freq = collections.defaultdict(int)

	for f in sys.argv[1:]:
	    if not match_files.search(f):
	        continue
	    try:
	        blame = subprocess.check_output(
	            ["git", "blame", "--line-porcelain", f], stderr=subprocess.PIPE)
	    except subprocess.CalledProcessError as e:
	        # Exit status 128 is treated as "this path cannot be blamed" and the
	        # file is skipped; any other failure is propagated.
	        if e.returncode == 128:
	            continue
	        raise
	    # check_output returns bytes on Python 3; decode so the str regex and
	    # the printed author names work there too. On Python 2 the output is
	    # already str and is left untouched.
	    if not isinstance(blame, str):
	        blame = blame.decode("utf-8", "replace")
	    # Split on "\n" only: str.splitlines() would also break on other
	    # separator characters that may occur inside blamed content lines.
	    # A list (not a lazy filter object) is required because it is both
	    # iterated and measured with len() below.
	    author_lines = [l for l in blame.split("\n") if author_line_re.match(l)]
	    authors_freq = collections.defaultdict(int)
	    for line in author_lines:
	        author = line.split(" ", 1)[1]
	        author = synonyms.get(author, author)
	        authors_found.add(author)
	        authors_freq[author] += 1
	        total_freq[author] += 1
	    # One "author" header is counted per blamed line, so this is the number
	    # of lines attributed in the file. It is only used as a divisor when
	    # authors_freq is non-empty, hence never zero there.
	    total_lines = len(author_lines)
	    file_densities[f] = {name: owned / total_lines
	                         for name, owned in authors_freq.items()}

	# Per-author report: the files where the author owns the largest share.
	# Authors are visited in sorted order so the output is reproducible.
	for author in sorted(authors_found):
	    author_densities = sorted(
	        ((density[author], fname)
	         for fname, density in file_densities.items()
	         if density.get(author)),
	        reverse=True)
	    print("\n%s" % author)
	    print("\n".join("%3.0f%%\t%s" % (100. * share, fname)
	                    for share, fname in author_densities[:count]))
	    print()

	# Overall table: line count, share of all blamed lines, author name, with
	# the largest contributors first (ties broken by name, descending).
	total_lines = sum(total_freq.values())
	records = sorted(total_freq.items(),
	                 key=lambda p: (p[1], p[0]), reverse=True)
	for author, freq in records:
	    print("%s\t%.3f%%\t\t%s" % (freq, 100. * freq / total_lines, author))