nhoffman · May 31, 2021 03:50
diff --git a/git_stats.py b/git_stats.py
 #!/usr/bin/env python3

 """Describe author contributions for one or more git repositories by date

 Output is a csv with columns (repo, author, timestamp, churn) where
 'churn' is the sum of lines added and lines removed.

 Requires pandas and plotnine

 """

 import sys
 import argparse
 import subprocess
 from pathlib import Path
 import re
 import csv

 import pandas as pd
 from plotnine import (ggplot, geom_bar, aes, facet_wrap, theme,
                      element_text, scale_y_log10)
 import plotnine as p9


 def parse_log(text, exclude=None):
    """Parse ``git log --numstat`` output into per-commit churn records.

    ``text`` is expected to contain one header line per commit of the
    form ``#<repo>|<author>|<timestamp>``, each followed by zero or more
    numstat lines (``<added> <removed> <filename>``).

    Args:
        text: the captured log output.
        exclude: optional regular expression; files whose names match
            it (via ``search``) do not contribute to churn.

    Yields:
        Tuples ``(repo, author, timestamp, churn)``, one per header
        line, where ``churn`` is the sum of lines added and removed.
        A file whose counts are not integers adds 1. Nothing is
        yielded when ``text`` contains no header line.

    Raises:
        ValueError: if a header line does not split into exactly three
            '|'-separated fields, or a non-blank, non-header line has
            fewer than three whitespace-separated fields.
    """
    pattern = re.compile(exclude) if exclude else None

    # Header fields of the commit currently being tallied. A None
    # sentinel (rather than the truthiness of the repo name) marks "no
    # commit seen yet", so commits are not dropped when the repo label
    # is an empty string.
    header = None
    churn = 0

    for line in text.splitlines():
        if line.startswith('#'):
            if header is not None:
                yield (*header, churn)
            repo, author, ts = line[1:].split('|')
            header, churn = (repo, author, ts), 0
        elif line.strip():
            # maxsplit=2 keeps file names containing whitespace intact
            gain, loss, filename = line.split(None, 2)
            if pattern and pattern.search(filename):
                continue
            try:
                churn += int(gain) + int(loss)
            except ValueError:
                # eg, binary files report lines changes using '-'
                churn += 1

    # Emit the last commit; emit nothing if no header was ever seen.
    if header is not None:
        yield (*header, churn)


 def main(arguments):
    """Tabulate per-commit churn for each repository and optionally plot it.

    For every path in ``repos``, runs ``git log --numstat`` on the
    ``master`` branch, parses the output with ``parse_log``, and collects
    rows of (repo, author, timestamp, churn). The raw rows are written
    to ``--outfile`` (if given) before any author filtering is applied.
    A stacked bar chart of churn per repo, author and month is saved to
    ``--plotfile`` (if given).

    Args:
        arguments: command-line arguments, excluding the program name.

    Returns:
        None.
    """

    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('repos', help="Path to one or more repositories", nargs='+')
    parser.add_argument('-x', '--exclude', help='regular expression excluding filenames')
    parser.add_argument('-a', '--author-map', type=argparse.FileType(),
                        help="""headerless csv file with columns
                        (label, author-email) providing instructions
                        for consolidating or (if the first column is
                        left empty) excluding authors. """)
    parser.add_argument('-o', '--outfile', help="CSV output file")
    parser.add_argument('-p', '--plotfile', help="plot output file")

    args = parser.parse_args(arguments)

    # Bind author_map unconditionally so the check further down cannot
    # raise NameError when --author-map is not given.
    author_map = {}
    if args.author_map:
        # Rows are (label, email); rows with an empty label are left out
        # of the mapping, which excludes those authors below.
        with args.author_map as handle:
            author_map = {v: k for k, v in csv.reader(handle) if k}

    data = []
    for path in args.repos:
        pth = Path(path)
        # Path('.').name is '', so fall back to the resolved directory name.
        repo = pth.name or pth.resolve().name
        cmd = ['git', '-C', path, '--no-pager', 'log',
               '--numstat', 'master', f'--format=format:#{repo}|%ce|%ci']

        job = subprocess.run(cmd, capture_output=True, text=True)
        if job.returncode != 0:
            # Report the failure instead of tabulating empty output.
            print(f'warning: git log failed for {path}: {job.stderr.strip()}',
                  file=sys.stderr)
            continue
        data.extend(parse_log(job.stdout, exclude=args.exclude))

    df = pd.DataFrame(data, columns=['repo', 'author', 'timestamp', 'churn'])
    if args.outfile:
        df.to_csv(args.outfile, index=False)

    if author_map:
        # Copy the filtered frame so the assignment below does not write
        # into a view of the original.
        df = df[df['author'].isin(author_map)].copy()
        df['author'] = df['author'].map(author_map)

    # %ci timestamps look like 'YYYY-MM-DD HH:MM:SS +ZZZZ', so the first
    # seven characters are the year and month in the committer's local
    # time. Slicing avoids pd.to_datetime, which deprecates parsing
    # mixed UTC offsets without utc=True.
    df['date'] = df['timestamp'].str.slice(0, 7)
    df = df.drop(columns=['timestamp'])
    tab = df.groupby(['repo', 'author', 'date']).sum().reset_index()

    if args.plotfile:
        plt = (ggplot(tab, aes('date', 'churn', fill='author')) +
               geom_bar(position='stack', stat='identity') +
               facet_wrap('~repo', ncol=1) +
               scale_y_log10() +
               p9.theme_538() +
               theme(
                   axis_text_x=element_text(rotation=90),
                   axis_text_y=element_text(size=0),
                   axis_title_y=element_text(size=0),
                   legend_position='top'))
        plt.save(args.plotfile)


 if __name__ == '__main__':
    # Run the command-line interface and pass main()'s return value to
    # the interpreter as the exit status.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)
	#!/usr/bin/env python3

	"""Describe author contributions for one or more git repositories by date

	Output is a csv with columns (repo, author, timestamp, churn) where
	'churn' is the sum of lines added and lines removed.

	Requires pandas and plotnine

	"""

	import sys
	import argparse
	import subprocess
	from pathlib import Path
	import re
	import csv

	import pandas as pd
	from plotnine import (ggplot, geom_bar, aes, facet_wrap, theme,
	element_text, scale_y_log10)
	import plotnine as p9


	def parse_log(text, exclude=None):
	    """Parse ``git log --numstat`` output into per-commit churn records.

	    ``text`` is expected to contain one header line per commit made of
	    a leading '#' and three fields (repo, author, timestamp), each
	    followed by zero or more numstat lines
	    (``<added> <removed> <filename>``).

	    Args:
	        text: the captured log output.
	        exclude: optional regular expression; files whose names match
	            it (via ``search``) do not contribute to churn.

	    Yields:
	        Tuples ``(repo, author, timestamp, churn)``, one per header
	        line, where ``churn`` is the sum of lines added and removed.
	        A file whose counts are not integers adds 1. Nothing is
	        yielded when ``text`` contains no header line.

	    Raises:
	        ValueError: if a header line does not split into exactly three
	            fields, or a non-blank, non-header line has fewer than
	            three whitespace-separated fields.
	    """
	    pattern = re.compile(exclude) if exclude else None

	    # Header fields of the commit currently being tallied. A None
	    # sentinel (rather than the truthiness of the repo name) marks "no
	    # commit seen yet", so commits are not dropped when the repo label
	    # is an empty string.
	    header = None
	    churn = 0

	    for line in text.splitlines():
	        if line.startswith('#'):
	            if header is not None:
	                yield (*header, churn)
	            # The field separator is the two-character sequence
	            # backslash + pipe, as in the original; a raw string
	            # avoids the invalid escape sequence '\|'.
	            repo, author, ts = line[1:].split(r'\|')
	            header, churn = (repo, author, ts), 0
	        elif line.strip():
	            # maxsplit=2 keeps file names containing whitespace intact
	            gain, loss, filename = line.split(None, 2)
	            if pattern and pattern.search(filename):
	                continue
	            try:
	                churn += int(gain) + int(loss)
	            except ValueError:
	                # eg, binary files report lines changes using '-'
	                churn += 1

	    # Emit the last commit; emit nothing if no header was ever seen.
	    if header is not None:
	        yield (*header, churn)


	def main(arguments):
	    """Tabulate per-commit churn for each repository and optionally plot it.

	    For every path in ``repos``, runs ``git log --numstat`` on the
	    ``master`` branch, parses the output with ``parse_log``, and collects
	    rows of (repo, author, timestamp, churn). The raw rows are written
	    to ``--outfile`` (if given) before any author filtering is applied.
	    A stacked bar chart of churn per repo, author and month is saved to
	    ``--plotfile`` (if given).

	    Args:
	        arguments: command-line arguments, excluding the program name.

	    Returns:
	        None.
	    """

	    parser = argparse.ArgumentParser(
	        description=__doc__,
	        formatter_class=argparse.RawDescriptionHelpFormatter)
	    parser.add_argument('repos', help="Path to one or more repositories", nargs='+')
	    parser.add_argument('-x', '--exclude', help='regular expression excluding filenames')
	    parser.add_argument('-a', '--author-map', type=argparse.FileType(),
	                        help="""headerless csv file with columns
	(label, author-email) providing instructions
	for consolidating or (if the first column is
	left empty) excluding authors. """)
	    parser.add_argument('-o', '--outfile', help="CSV output file")
	    parser.add_argument('-p', '--plotfile', help="plot output file")

	    args = parser.parse_args(arguments)

	    # Bind author_map unconditionally so the check further down cannot
	    # raise NameError when --author-map is not given.
	    author_map = {}
	    if args.author_map:
	        # Rows are (label, email); rows with an empty label are left out
	        # of the mapping, which excludes those authors below.
	        with args.author_map as handle:
	            author_map = {v: k for k, v in csv.reader(handle) if k}

	    data = []
	    for path in args.repos:
	        pth = Path(path)
	        # Path('.').name is '', so fall back to the resolved directory name.
	        repo = pth.name or pth.resolve().name
	        # The field separator is the two-character sequence backslash +
	        # pipe, as in the original; a raw f-string avoids the invalid
	        # escape sequence '\|'.
	        cmd = ['git', '-C', path, '--no-pager', 'log',
	               '--numstat', 'master', rf'--format=format:#{repo}\|%ce\|%ci']

	        job = subprocess.run(cmd, capture_output=True, text=True)
	        if job.returncode != 0:
	            # Report the failure instead of tabulating empty output.
	            print(f'warning: git log failed for {path}: {job.stderr.strip()}',
	                  file=sys.stderr)
	            continue
	        data.extend(parse_log(job.stdout, exclude=args.exclude))

	    df = pd.DataFrame(data, columns=['repo', 'author', 'timestamp', 'churn'])
	    if args.outfile:
	        df.to_csv(args.outfile, index=False)

	    if author_map:
	        # Copy the filtered frame so the assignment below does not write
	        # into a view of the original.
	        df = df[df['author'].isin(author_map)].copy()
	        df['author'] = df['author'].map(author_map)

	    # %ci timestamps look like 'YYYY-MM-DD HH:MM:SS +ZZZZ', so the first
	    # seven characters are the year and month in the committer's local
	    # time. Slicing avoids pd.to_datetime, which deprecates parsing
	    # mixed UTC offsets without utc=True.
	    df['date'] = df['timestamp'].str.slice(0, 7)
	    df = df.drop(columns=['timestamp'])
	    tab = df.groupby(['repo', 'author', 'date']).sum().reset_index()

	    if args.plotfile:
	        plt = (ggplot(tab, aes('date', 'churn', fill='author')) +
	               geom_bar(position='stack', stat='identity') +
	               facet_wrap('~repo', ncol=1) +
	               scale_y_log10() +
	               p9.theme_538() +
	               theme(
	                   axis_text_x=element_text(rotation=90),
	                   axis_text_y=element_text(size=0),
	                   axis_title_y=element_text(size=0),
	                   legend_position='top'))
	        plt.save(args.plotfile)


	if __name__ == '__main__':
	    # Run the command-line interface and pass main()'s return value to
	    # the interpreter as the exit status.
	    exit_status = main(sys.argv[1:])
	    sys.exit(exit_status)
No results found