Skip to content

Instantly share code, notes, and snippets.

@ksindi
Last active April 23, 2018 22:22
Show Gist options
  • Save ksindi/2dd7fbfc0066aedc5c69b53e1b493da8 to your computer and use it in GitHub Desktop.
Save ksindi/2dd7fbfc0066aedc5c69b53e1b493da8 to your computer and use it in GitHub Desktop.
Top authors by number of lines committed
# List every file recorded at HEAD under the current directory (NUL-separated),
# run `git blame --line-porcelain` on each file, keep the name from every
# "author " header line (one per blamed line), then count the occurrences per
# author and sort by that count, largest first.
git ls-tree -rz --name-only HEAD -- . | xargs -0n1 git blame --line-porcelain HEAD -- | grep -aoP "(?<=^author ).*" | sort | uniq -c | sort -rnk1
@ksindi
Copy link
Author

ksindi commented Apr 22, 2018

import tempfile
from fnmatch import fnmatch

import pandas as pd
import seaborn
import git
from tqdm import tqdm_notebook as tqdm
from joblib import Parallel, delayed
import numpy as np


# Filesystem path of the git repository to analyse (placeholder value).
repo_name = '/path/to/repo/'

# Open the repository and pick the branch whose tree is walked and blamed.
repo = git.Repo(repo_name)
branch = repo.heads['master']
root = repo.tree(branch)

# Path filters: a file is analysed when its path matches at least one pattern
# in `only` and none of the patterns in `ignore`.
only = ['*']  # fnmatch
ignore = []


def iter_blobs(root, only, ignore):
    """Yield the blobs under ``root`` whose path passes the filters.

    Blobs sitting directly in ``root.blobs`` are yielded first, then every
    tree in ``root.trees`` is walked recursively, in order. A blob is kept
    when its ``path`` matches at least one ``fnmatch`` pattern in ``only``
    and none of the patterns in ``ignore``.
    """
    def matches_any(path, patterns):
        # True as soon as a single pattern accepts the path.
        return any(fnmatch(path, pattern) for pattern in patterns)

    for entry in root.blobs:
        if not matches_any(entry.path, only):
            continue
        if matches_any(entry.path, ignore):
            continue
        yield entry

    for subtree in root.trees:
        yield from iter_blobs(subtree, only, ignore)


def populate_author_lines(path, branch, path2id, author2id, author_lines):
    """Add the blame line counts of one file to ``author_lines`` in place.

    Blames ``path`` at ``branch`` through the module-level ``repo``. For
    each ``(commit, lines)`` entry returned, ``len(lines)`` is added to the
    cell at row ``path2id[path]`` and column ``author2id[<author name>]``.
    Returns ``None``.
    """
    blame_entries = repo.blame(branch, path, line_porcelain=True)
    for blamed_commit, blamed_lines in blame_entries:
        author_name = blamed_commit.author.name
        cell = (path2id[path], author2id[author_name])
        author_lines[cell] += len(blamed_lines)
            
            
# Column index: every author name in the history of the analysed branch.
# `%aN` is git's author name with .mailmap applied. The log is taken from
# `branch` rather than from whatever HEAD currently points at, so it covers the
# same history that is blamed below; the trailing `--` keeps the branch name
# from being read as a path. Sorting makes the column order reproducible
# between runs, which matters because the matrix is persisted to a file.
authors = sorted(set(repo.git.log(branch.name, '--format=%aN', '--').split('\n')))
# Row index: every file in the tree that passes the only/ignore filters.
paths = [blob.path for blob in iter_blobs(root, only, ignore)]

author2id = {name: i for i, name in enumerate(authors)}
path2id = {path: i for i, path in enumerate(paths)}

num_authors = len(author2id)
num_paths = len(path2id)

# Only used as a template for the shape of the file-backed matrix below.
_author_lines = np.zeros((num_paths, num_authors), dtype='int')

# Reserve a file name for the matrix. The handle is closed straight away (the
# original left it open for the life of the session); delete=False keeps the
# file on disk so np.memmap can open it by name.
with tempfile.NamedTemporaryFile(delete=False) as tmp:
    pass
print("author_lines path: ", tmp.name)
author_lines = np.memmap(tmp.name, dtype='int', shape=_author_lines.shape, mode='w+')

# One task per file; each task only touches the row belonging to its own path.
# NOTE(review): relies on joblib handing the memmap to workers backed by the
# same file so their writes are visible here - confirm for the joblib backend
# in use.
_ = Parallel(n_jobs=-1)(delayed(populate_author_lines)(path, branch, path2id, author2id, author_lines)
                        for path in paths)

%matplotlib inline

# Share of all blamed lines per author: column totals over the grand total.
_column_totals = author_lines.sum(axis=0)
_grand_total = author_lines.sum()
author2lines = [
    (name, share) for name, share in zip(authors, _column_totals / _grand_total)
]

# One horizontal bar per author, labelled by author name.
df = pd.DataFrame(author2lines, columns=['author', 'line_share'])
_share_by_author = df.set_index('author')
_share_by_author.plot.barh(title="Commit Line Market Share", figsize=(10, 10))

@ksindi
Copy link
Author

ksindi commented Apr 23, 2018

Hackier version: git ls-tree -r -z --name-only HEAD -- . | xargs -0 -n1 sh -c 'git blame --line-porcelain HEAD -- "$1" | grep -aoP "(?<=^author ).*" | sort | uniq -c' _ | awk '{n = $1; sub(/^[ \t]*[0-9]+[ \t]+/, ""); a[$0] += n} END {for (i in a) print a[i], i}' | sort -k 1 -n

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment