#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import json
import multiprocessing
import operator
import os
import signal
import subprocess
import sys
import tempfile
from functools import partial, reduce

import commentjson as jsonComment
import psutil
from binaryornot.check import is_binary
from tqdm import tqdm

def check(cmd, stderr=None, stdout=None, **kwargs):
    """Run a command and return `True` if it exits with status 0, `False` otherwise.

    Note: check(r'git check-attr --cached --all -- "{0}" | grep -q "filter: lfs"'.format(blob.path), shell=True)
    requires double quotes around "{0}"; single quotes would not work.
    """
    try:
        subprocess.check_call(cmd, stdout=stdout, stderr=stderr, **kwargs)
        return True
    except subprocess.CalledProcessError:
        return False

def get(cmd, raw=False, stderr=None, pipefail=True, **kwargs):
    """Run a command and return its output, either raw or split into lines."""
    if kwargs.get("shell", False) and pipefail:
        # We launch a pipe command and want "set -o pipefail",
        # so wrap the command in bash.
        assert isinstance(cmd, str), "{0} is not a string".format(cmd)
        cmd = ['bash', '-o', 'pipefail', '-c', cmd]
        kwargs["shell"] = False
    out = subprocess.check_output(cmd, stderr=stderr, **kwargs)
    if raw:
        return out
    else:
        return out.strip().split('\n') if out else []
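
# A minimal sketch (command illustrative) of why the bash wrapping above
# matters: with `shell=True` the command is rewritten as
# ['bash', '-o', 'pipefail', '-c', cmd], so a failure of the *first* stage
# of a pipe raises CalledProcessError instead of being masked by the last:
#
#   get("git rev-list no-such-rev | head -n 1", shell=True, encoding="utf-8")
#   # -> raises subprocess.CalledProcessError because rev-list fails,
#   #    even though `head` itself exits with 0.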

def log(*args, **kwargs):
    print(*args, file=sys.stdout, **kwargs)

def jsonDump(obj, filePath, *args, **kwargs):
    dirPath = os.path.dirname(filePath)
    if dirPath:
        os.makedirs(dirPath, exist_ok=True)
    with open(filePath, "w") as f:
        json.dump(obj, f, *args, **kwargs)

def jsonLoad(filePath, hasComments=False):
    with open(filePath, encoding="utf-8") as f:
        if hasComments:
            return jsonComment.load(f)
        else:
            return json.load(f)

def assertMultiProcessingPool(procs=None):
    """Make a pool for parallel computations."""
    def initProcess():
        # Ignore SIGINT in worker processes so that Ctrl+C
        # is handled only by the parent process.
        signal.signal(signal.SIGINT, signal.SIG_IGN)
    poolSize = len(psutil.Process().cpu_affinity()) if procs is None else procs
    return multiprocessing.Pool(poolSize, initProcess)
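
# Illustrative note: the returned pool also works as a context manager, e.g.
#
#   with assertMultiProcessingPool(4) as pool:
#       results = pool.map(someFunc, someItems)  # someFunc/someItems hypothetical
#
# Because the workers ignore SIGINT, a Ctrl+C interrupts only the parent,
# which can then terminate the pool cleanly.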

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
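
# Example: list(chunks([1, 2, 3, 4, 5], 2)) == [[1, 2], [3, 4], [5]]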

# Parallel helper: list all blobs with their paths in all trees.
def getBlobs(repoDir, trees):
    """Collect all blobs together with all paths inside each tree
    in `trees`.
    """
    allBlobs = {}
    for treeSha in trees:
        lines = get(
            ["git", "ls-tree", "--full-name", "--full-tree", "-r", treeSha],
            cwd=repoDir,
            encoding="utf-8",
        )
        for l in lines:
            # Split "<mode> <type> <sha>\t<path>" into its four fields:
            # first the three space-separated fields, then the tab-separated path.
            s = l.split(" ", maxsplit=2)
            s = s[0:-1] + s[-1].split("\t", maxsplit=1)
            if len(s) != 4:
                raise ValueError(s)
            blobSha = s[2]
            path = s[-1]
            allBlobs.setdefault(blobSha, set()).add(path)
    return allBlobs
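
# A raw `git ls-tree -r` line looks like (sha illustrative):
#   100644 blob 8f94cb2ed53a...\tsrc/main.py
# which the parsing above turns into
#   ["100644", "blob", "8f94cb2ed53a...", "src/main.py"].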

def combine(results):
    """Combine all per-chunk results into one list of blob records."""
    allBlobs = {}
    for r in tqdm(results):
        for sha, paths in r.items():
            allBlobs.setdefault(sha, set()).update(paths)
    data = [{
        "sha": sha,
        "paths": list(paths),
        "size": None,
        "size:disk": None,
        "fileType": None
    } for sha, paths in allBlobs.items()]
    return data, list(allBlobs.keys())

def getSizes(repoDir, shas):
    """Query `git cat-file --batch-check` for the size and on-disk size
    of every blob in `shas`; output order matches input order."""
    t = tempfile.TemporaryFile(mode="w+b")
    inp = ["{0}\n".format(sha).encode("utf-8") for sha in shas]
    t.writelines(inp)
    t.seek(0)
    out = tempfile.TemporaryFile(mode="w+b")
    subprocess.check_call(["git", "cat-file", "--batch-check=%(objectsize),%(objectsize:disk)"],
                          cwd=repoDir,
                          stdin=t,
                          stdout=out)
    out.seek(0)
    return [l.decode("utf-8").strip().split(",") for l in out.readlines()]
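
# Each output line has the form "<objectsize>,<objectsize:disk>",
# e.g. "1432,412" (numbers illustrative), one line per input sha.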

def classifyBlob(repoDir, blobSha):
    """Classify a blob as 'binary' or 'text': dump it via `git show`
    and run the `is_binary` heuristic on the dumped file."""
    fd, tempPath = tempfile.mkstemp()
    os.close(fd)
    try:
        with open(tempPath, "wb") as f:
            subprocess.check_call(["git", "show", blobSha], cwd=repoDir, stdout=f)
        return "binary" if is_binary(tempPath) else "text"
    finally:
        os.remove(tempPath)

if __name__ == "__main__":
    argparser = argparse.ArgumentParser(prog="GetBlobs", description="Get all blobs")
    argparser.add_argument("-r", '--repoDir', required=True, help="Git repository")
    argparser.add_argument(
        "-g",
        '--debug',
        required=False,
        action="store_true",
        help="Only process the range HEAD~5..HEAD",
    )
    argparser.add_argument(
        "-c",
        '--chunkSize',
        required=False,
        type=int,
        default=512,
        help="Approximate number of chunks to split the work into for parallel processing"
    )
    argparser.add_argument("-t", '--tmpDir', required=False, default=None, help="Temporary directory to use")
    argparser.add_argument("-a", '--auxDir', required=True, help="Auxiliary output directory.")
    argparser.add_argument("-o", '--outputFile', required=True, help="Output file.")
    args = argparser.parse_args()
    tempfile.tempdir = args.tmpDir
    pool = assertMultiProcessingPool(os.cpu_count())
    # Get all objects in the history.
    # `--filter=tree:1` keeps only root trees (no nested trees, no blobs);
    # `getBlobs` later recurses into them with `git ls-tree -r`.
    what = "--all" if not args.debug else "HEAD~5..HEAD"
    objects = get(
        "git rev-list --objects --full-history --filter=tree:1 {0} | ".format(what) +
        "git cat-file --batch-check='%(objecttype),%(objectname),%(rest)'",
        cwd=args.repoDir,
        shell=True,
        encoding="utf-8"
    )
    print("Got '{0}' objects from commits in history".format(len(objects)))
    # Keep only the `tree` objects
    trees = []
    for o in objects:
        t = o.strip().split(",")
        if len(t) >= 2 and t[0] == "tree":
            trees.append(t[1])
    print("Got '{0}' trees in history".format(len(trees)))
    nSize = max(1, int(len(trees) / args.chunkSize))
    cks = list(chunks(trees, nSize))
    results = list(tqdm(
        pool.imap_unordered(
            partial(getBlobs, args.repoDir),
            cks,
        ),
        total=len(cks),
    ))
    print("Accumulating '{0}' chunks ...".format(len(results)))
    allBlobs, allBlobSha = combine(results)
    print("Get size for '{0}' blobs ...".format(len(allBlobSha)))
    nSize = max(1, int(len(allBlobSha) / args.chunkSize))
    cks = list(chunks(allBlobSha, nSize))
    # Note: ordered `imap` (not `imap_unordered`) keeps the size results
    # aligned with `allBlobSha`.
    allSizes = list(tqdm(
        pool.imap(
            partial(getSizes, args.repoDir),
            cks,
        ),
        total=len(cks),
    ))
    allSizes = reduce(operator.iconcat, allSizes, [])
    assert len(allBlobSha) == len(allSizes), "Wrong number of sizes returned"
    for data, size in zip(allBlobs, allSizes):
        data["size"] = int(size[0])
        data["size:disk"] = int(size[1])
    if allBlobs and allBlobs[-1]["sha"] != allBlobSha[-1]:
        raise RuntimeError("Programming error")
    aux = os.path.join(args.auxDir, "allBlobs.json")
    print("Dumping all blobs to '{0}'".format(aux))
    jsonDump(allBlobs, aux)
    print("Classify '{0}' blobs into binary or text ...".format(len(allBlobs)))
    allTypes = list(
        tqdm(
            pool.imap(
                partial(classifyBlob, args.repoDir),
                allBlobSha,
                chunksize=64,
            ),
            total=len(allBlobSha),
        )
    )
    for data, fileType in zip(allBlobs, allTypes):
        data["fileType"] = fileType
    print("Saving all '{0}' blobs to '{1}' ...".format(len(allBlobs), args.outputFile))
    jsonDump(
        allBlobs,
        args.outputFile,
        indent=2,
    )
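
# Example invocation (script name and paths illustrative):
#   python getAllBlobs.py -r /path/to/repo -a /tmp/aux -o /tmp/allBlobs.json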