@gabyx
Created June 19, 2020 13:55
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import argparse
import tempfile
import operator
from functools import partial, reduce
import subprocess
from tqdm import tqdm
from binaryornot.check import is_binary
import sys
import signal
import multiprocessing
import psutil
import commentjson as jsonComment
import json
import requests

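# Third-party dependencies used below (PyPI package names assumed to match
# the import names): pip install tqdm binaryornot psutil commentjson requests
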
def check(cmd, stderr=None, stdout=None, **kwargs):
    """Run a command and return True on success, False otherwise.

    Note: check(r'git check-attr --cached --all -- "{0}" | grep -q "filter: lfs"'.format(blob.path), shell=True)
    Quoting '{0}' would not work here.
    """
    try:
        subprocess.check_call(cmd, stdout=stdout, stderr=stderr, **kwargs)
        return True
    except subprocess.CalledProcessError:
        return False

def get(cmd, raw=False, stderr=None, pipefail=True, **kwargs):
    if kwargs.get("shell", False) and pipefail:
        # We launch a pipe command and want "set -o pipefail",
        # so wrap the command with bash.
        assert isinstance(cmd, str), "{0} is not a string".format(cmd)
        cmd = ['bash', '-o', 'pipefail', '-c', cmd]
        kwargs["shell"] = False
    out = subprocess.check_output(cmd, stderr=stderr, **kwargs)
    if raw:
        return out
    else:
        return out.strip().split('\n') if out else []

def log(*args, **kwargs):
    print(*args, file=sys.stdout, **kwargs)

def jsonDump(obj, filePath, *args, **kwargs):
    dirPath = os.path.dirname(filePath)
    if dirPath:
        os.makedirs(dirPath, exist_ok=True)
    with open(filePath, "w") as f:
        json.dump(obj, f, *args, **kwargs)

def jsonLoad(filePath, hasComments=False):
    with open(filePath, encoding="utf-8") as f:
        if hasComments:
            return jsonComment.load(f)
        else:
            return json.load(f)

def assertMultiProcessingPool(procs=None):
    """Create a pool for parallel computations."""

    def initProcess():
        # Ignore SIGINT in the worker processes; the parent handles it.
        signal.signal(signal.SIGINT, signal.SIG_IGN)

    poolSize = len(psutil.Process().cpu_affinity()) if procs is None else procs
    return multiprocessing.Pool(poolSize, initProcess)

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

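# For example, list(chunks([1, 2, 3, 4, 5], 2)) gives [[1, 2], [3, 4], [5]].
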
# Parallel worker to list all blobs
# with their paths in all trees.
def getBlobs(repoDir, trees):
    """Collect all blobs together with all paths inside each tree
    in `trees`.
    """
    allBlobs = {}
    for treeSha in trees:
        lines = get(
            ["git", "ls-tree", "--full-name", "--full-tree", "-r", "{0}".format(treeSha)],
            cwd=repoDir,
            encoding="utf-8",
        )
        for l in lines:
            # Split '<mode> <type> <sha>\t<path>' into its four fields.
            s = l.split(" ", maxsplit=2)
            s = s[0:-1] + s[-1].split("\t", maxsplit=1)
            if len(s) != 4:
                raise ValueError(s)
            # Skip non-blob entries (e.g. submodules, which show up as 'commit').
            if s[1] != "blob":
                continue
            blobSha = s[2]
            path = s[-1]
            allBlobs.setdefault(blobSha, set()).add(path)
    return allBlobs

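# getBlobs returns a dict mapping each blob sha to the set of paths it appears
# under across the given trees, e.g. {'9ae9f86...': {'src/main.cpp', 'backup/main.cpp'}}
# (illustrative values).
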
def combine(results):
    """Combine all results."""
    allBlobs = {}
    for r in tqdm(results):
        for sha, paths in r.items():
            allBlobs.setdefault(sha, set()).update(paths)
    data = [{
        "sha": sha,
        "paths": list(paths),
        "size": None,
        "size:disk": None,
        "fileType": None
    } for sha, paths in allBlobs.items()]
    return data, list(allBlobs.keys())

def getSizes(repoDir, shas):
    t = tempfile.TemporaryFile(mode="w+b")
    inp = ["{0}\n".format(sha).encode("utf-8") for sha in shas]
    t.writelines(inp)
    t.seek(0)
    out = tempfile.TemporaryFile(mode="w+b")
    subprocess.check_call(["git", "cat-file", "--batch-check=%(objectsize),%(objectsize:disk)"],
                          cwd=repoDir,
                          stdin=t,
                          stdout=out)
    out.seek(0)
    return [l.decode("utf-8").strip().split(",") for l in out.readlines()]

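# With the custom --batch-check format, git cat-file prints one
# '<objectsize>,<objectsize:disk>' line per input sha, in input order,
# e.g. '1532,648' (illustrative values).
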
def classifyBlob(repoDir, blobSha):
    """Classify a blob as 'binary' or 'text' by dumping its content with git
    and running the 'is_binary' check on it."""
    fd, tempPath = tempfile.mkstemp()
    os.close(fd)
    try:
        with open(tempPath, "wb") as f:
            subprocess.check_call(["git", "show", blobSha], cwd=repoDir, stdout=f)
        return "binary" if is_binary(tempPath) else "text"
    finally:
        os.remove(tempPath)

if __name__ == "__main__":
    argparser = argparse.ArgumentParser(prog="GetBlobs", description="Get all blobs")
    argparser.add_argument("-r", '--repoDir', required=True, help="Git repository")
    argparser.add_argument(
        "-g",
        '--debug',
        required=False,
        action="store_true",
        help="Only inspect the range HEAD~5..HEAD",
    )
    argparser.add_argument(
        "-c",
        '--chunkSize',
        required=False,
        type=int,
        default=512,
        help="Number of chunks to use for parallel processing of trees"
    )
    argparser.add_argument("-t", '--tmpDir', required=False, default=None, help="Temporary directory to use")
    argparser.add_argument("-a", '--auxDir', required=True, help="Auxiliary output directory.")
    argparser.add_argument("-o", '--outputFile', required=True, help="Output file.")
    args = argparser.parse_args()

    tempfile.tempdir = args.tmpDir
    pool = assertMultiProcessingPool(os.cpu_count())

    # Get all objects in the history.
    what = "--all" if not args.debug else "HEAD~5..HEAD"
    objects = get(
        "git rev-list --objects --full-history --filter=tree:1 {0} | ".format(what) +
        "git cat-file --batch-check='%(objecttype),%(objectname),%(rest)'",
        cwd=args.repoDir,
        shell=True,
        encoding="utf-8"
    )
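    # The pipeline lists every object reachable from the chosen revisions
    # (--filter=tree:1 restricts the listing to commits and root trees) and
    # cat-file turns each into a '<type>,<sha>,<rest>' line that is parsed below.
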
print("Got '{0}' objects from commits in history".format(len(objects)))
# Get only all `tree` objects
trees = []
for o in objects:
t = o.strip().split(",")
if len(t) >= 2 and t[0] == "tree":
trees.append(t[1])
print("Got '{0}' trees in history".format(len(trees)))
nSize = max(1, int(len(trees) / args.chunkSize))
cks = list(chunks(trees, nSize))
results = list(tqdm(
pool.imap_unordered(
partial(getBlobs, args.repoDir),
cks,
),
total=len(cks),
))
print("Accumulating '{0}' chunks ...".format(len(results)))
allBlobs, allBlobSha = combine(results)
print("Get size for '{0}' blobs ...".format(len(allBlobSha)))
nSize = max(1, int(len(allBlobSha) / args.chunkSize))
cks = list(chunks(allBlobSha, nSize))
allSizes = list(tqdm(
pool.imap(
partial(getSizes, args.repoDir),
cks,
),
total=len(cks),
))
allSizes = reduce(operator.iconcat, allSizes, [])
assert len(allBlobSha) == len(allSizes), "Wrong number of sizes returned"
for data, size in zip(allBlobs, allSizes):
data["size"] = int(size[0])
data["size:disk"] = int(size[1])
if allBlobs and allBlobs[-1]["sha"] != allBlobSha[-1]:
raise RuntimeError("Programming error")
    aux = os.path.join(args.auxDir, "allBlobs.json")
    print("Dumping all blobs to '{0}'".format(aux))
    jsonDump(allBlobs, aux)

    print("Classifying '{0}' blobs into binary or text ...".format(len(allBlobs)))
    allTypes = list(
        tqdm(
            pool.imap(
                partial(classifyBlob, args.repoDir),
                allBlobSha,
                chunksize=64,
            ),
            total=len(allBlobSha),
        )
    )
    for data, fileType in zip(allBlobs, allTypes):
        data["fileType"] = fileType

    print("Saving all '{0}' blobs to '{1}' ...".format(len(allBlobs), args.outputFile))
    jsonDump(
        allBlobs,
        args.outputFile,
        indent=2,
    )
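
# Example invocation (assuming this file is saved as getBlobs.py):
#   python getBlobs.py -r /path/to/repo -a ./aux -o ./allBlobsClassified.json
# This writes aux/allBlobs.json plus the final classified blob list as JSON.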