ksindi/author_share.sh

ksindi · 2018-04-22T23:37:46Z

import tempfile
from fnmatch import fnmatch

import pandas as pd
import seaborn
import git
from tqdm import tqdm_notebook as tqdm
from joblib import Parallel, delayed
import numpy as np


# Filesystem path of the git repository to analyse (placeholder value; edit
# before running).
repo_name = '/path/to/repo/'

# Open the repository and take the root tree of the 'master' branch head;
# `root` is the starting point for the recursive file walk below.
repo = git.Repo(repo_name)
branch = repo.heads['master']
root = repo.tree(branch)

# fnmatch-style patterns tested against each blob's path: a file is analysed
# when it matches at least one `only` pattern and none of the `ignore` patterns.
only = ['*']  # fnmatch
ignore = []


def iter_blobs(root, only, ignore):
    """Lazily yield every selected blob beneath ``root``, depth-first.

    The blobs directly inside ``root`` are yielded first (in ``root.blobs``
    order), followed by the blobs of each entry in ``root.trees``, recursively.

    Args:
        root: Tree-like object exposing ``blobs`` and ``trees`` iterables.
        only: fnmatch patterns; a blob's ``path`` must match at least one.
        ignore: fnmatch patterns; a blob whose ``path`` matches any is skipped.

    Yields:
        The blob objects whose ``path`` passes both filters.
    """
    def _is_selected(blob_path):
        # Inclusion is checked first so the ignore patterns are only
        # consulted for paths that were selected in the first place.
        if not any(fnmatch(blob_path, wanted) for wanted in only):
            return False
        return not any(fnmatch(blob_path, unwanted) for unwanted in ignore)

    yield from (entry for entry in root.blobs if _is_selected(entry.path))

    for subtree in root.trees:
        yield from iter_blobs(subtree, only, ignore)


def populate_author_lines(path, branch, path2id, author2id, author_lines):
    """Accumulate the blamed line counts of one file into ``author_lines``.

    Runs a blame of ``path`` at ``branch`` through the module-level ``repo``
    and, for every ``(commit, lines)`` entry it returns, adds ``len(lines)``
    to the cell addressed by the file's row and the commit author's column.

    Args:
        path: Repository path of the file to blame; must be a key of
            ``path2id``.
        branch: Revision passed to ``repo.blame``.
        path2id: Mapping from file path to row index.
        author2id: Mapping from author name to column index.
        author_lines: 2-D integer array indexed ``[row, column]``; updated
            in place.

    Returns:
        None. The result is the in-place update of ``author_lines``.

    Raises:
        KeyError: If the path or a blamed author name has no index.
    """
    blame_entries = repo.blame(branch, path, line_porcelain=True)
    for commit, lines in blame_entries:
        row = path2id[path]
        column = author2id[commit.author.name]
        author_lines[row, column] += len(lines)
            
            
# Distinct author names found by `git log`. Sorted so that the column order of
# the matrix (and therefore of the final chart) is the same on every run
# instead of following set iteration order.
# NOTE(review): `git log` is run without a revision, so it lists authors
# reachable from the current HEAD, while the blame below runs on `branch` --
# confirm both refer to the same history.
authors = sorted(set(repo.git.log('--format=%aN').split('\n')))

# Paths of every selected file, walked once and reused for the fan-out below.
paths = [blob.path for blob in iter_blobs(root, only, ignore)]

# Column index per author and row index per file.
author2id = {name: i for i, name in enumerate(authors)}
path2id = {path: i for i, path in enumerate(paths)}

num_authors = len(author2id)
num_paths = len(path2id)

# The count matrix lives in a file-backed memmap so the worker processes can
# all write into the same storage. delete=False keeps the backing file on disk
# after this handle goes away; its path is printed so it can be inspected or
# removed by hand afterwards.
tmp = tempfile.NamedTemporaryFile(delete=False)
print("author_lines path: ", tmp.name)
# The shape is passed directly: no throwaway in-memory array is needed just to
# describe the dimensions. A freshly created 'w+' memmap starts zero-filled.
author_lines = np.memmap(tmp, dtype='int', shape=(num_paths, num_authors), mode='w+')

# Blame every file in parallel on all available cores. Each task only touches
# the row belonging to its own path, so tasks never update the same cell.
_ = Parallel(n_jobs=-1)(
    delayed(populate_author_lines)(path, branch, path2id, author2id, author_lines)
    for path in paths
)

%matplotlib inline

author2lines = list(zip(authors, author_lines.sum(axis=0) / author_lines.sum()))

df = pd.DataFrame(author2lines, columns=['author', 'line_share'])
df.set_index('author').plot.barh(title="Commit Line Market Share", figsize=(10, 10))

ksindi · 2018-04-23T22:22:13Z

Hackier version: git ls-tree -r -z --name-only HEAD -- . | xargs -0 -n1 -I % sh -c 'git blame --line-porcelain HEAD -- % |grep -aoP "(?<=^author ).*"|sort|uniq -c;' | awk 'BEGIN{}{a[$2]+=$1}END{ for (i in a) print a[i],i}' | sort -k 1 -n

ksindi/author_share.sh

ksindi commented Apr 22, 2018

Uh oh!

ksindi commented Apr 23, 2018

Uh oh!