#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import json
import multiprocessing
import operator
import os
import signal
import subprocess
import sys
import tempfile
from functools import partial, reduce

import commentjson as jsonComment
import psutil
from binaryornot.check import is_binary
from tqdm import tqdm

def check(cmd, stderr=None, stdout=None, **kwargs):
    """Run a command and return `True` if it exits with status 0, `False` otherwise.

    Note: check(r'git check-attr --cached --all -- "{0}" | grep -q "filter: lfs"'.format(blob.path), shell=True)
    requires double quotes around "{0}"; single quotes would not work.
    """
    try:
        subprocess.check_call(cmd, stdout=stdout, stderr=stderr, **kwargs)
        return True
    except subprocess.CalledProcessError:
        return False

def get(cmd, raw=False, stderr=None, pipefail=True, **kwargs):
    """Run a command and return its output, either raw or split into lines."""
    if kwargs.get("shell", False) and pipefail:
        # We launch a pipe command and want "set -o pipefail",
        # so wrap the command in bash.
        assert isinstance(cmd, str), "{0} is not a string".format(cmd)
        cmd = ['bash', '-o', 'pipefail', '-c', cmd]
        kwargs["shell"] = False
    out = subprocess.check_output(cmd, stderr=stderr, **kwargs)
    if raw:
        return out
    else:
        return out.strip().split('\n') if out else []
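
# A minimal sketch (command illustrative) of why the bash wrapping above
# matters: with `shell=True` the command is rewritten as
# ['bash', '-o', 'pipefail', '-c', cmd], so a failure of the *first* stage
# of a pipe raises CalledProcessError instead of being masked by the last:
#
#   get("git rev-list no-such-rev | head -n 1", shell=True, encoding="utf-8")
#   # -> raises subprocess.CalledProcessError because rev-list fails,
#   #    even though `head` itself exits with 0.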

def log(*args, **kwargs):
    print(*args, file=sys.stdout, **kwargs)

def jsonDump(obj, filePath, *args, **kwargs):
    dirPath = os.path.dirname(filePath)
    if dirPath:
        os.makedirs(dirPath, exist_ok=True)
    with open(filePath, "w") as f:
        json.dump(obj, f, *args, **kwargs)

def jsonLoad(filePath, hasComments=False):
    with open(filePath, encoding="utf-8") as f:
        if hasComments:
            return jsonComment.load(f)
        else:
            return json.load(f)

def assertMultiProcessingPool(procs=None):
    """Make a pool for parallel computations."""
    def initProcess():
        # Ignore SIGINT in worker processes so that Ctrl+C
        # is handled only by the parent process.
        signal.signal(signal.SIGINT, signal.SIG_IGN)
    poolSize = len(psutil.Process().cpu_affinity()) if procs is None else procs
    return multiprocessing.Pool(poolSize, initProcess)
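
# Illustrative note: the returned pool also works as a context manager, e.g.
#
#   with assertMultiProcessingPool(4) as pool:
#       results = pool.map(someFunc, someItems)  # someFunc/someItems hypothetical
#
# Because the workers ignore SIGINT, a Ctrl+C interrupts only the parent,
# which can then terminate the pool cleanly.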

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
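
# Example: list(chunks([1, 2, 3, 4, 5], 2)) == [[1, 2], [3, 4], [5]]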

# Parallel helper: list all blobs with their paths in all trees.
def getBlobs(repoDir, trees):
    """Collect all blobs together with all paths inside each tree
    in `trees`.
    """
    allBlobs = {}
    for treeSha in trees:
        lines = get(
            ["git", "ls-tree", "--full-name", "--full-tree", "-r", treeSha],
            cwd=repoDir,
            encoding="utf-8",
        )
        for l in lines:
            # Split "<mode> <type> <sha>\t<path>" into its four fields:
            # first the three space-separated fields, then the tab-separated path.
            s = l.split(" ", maxsplit=2)
            s = s[0:-1] + s[-1].split("\t", maxsplit=1)
            if len(s) != 4:
                raise ValueError(s)
            blobSha = s[2]
            path = s[-1]
            allBlobs.setdefault(blobSha, set()).add(path)
    return allBlobs
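
# A raw `git ls-tree -r` line looks like (sha illustrative):
#   100644 blob 8f94cb2ed53a...\tsrc/main.py
# which the parsing above turns into
#   ["100644", "blob", "8f94cb2ed53a...", "src/main.py"].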

def combine(results):
    """Combine all per-chunk results into one list of blob records."""
    allBlobs = {}
    for r in tqdm(results):
        for sha, paths in r.items():
            allBlobs.setdefault(sha, set()).update(paths)
    data = [{
        "sha": sha,
        "paths": list(paths),
        "size": None,
        "size:disk": None,
        "fileType": None
    } for sha, paths in allBlobs.items()]
    return data, list(allBlobs.keys())

def getSizes(repoDir, shas):
    """Query `git cat-file --batch-check` for the size and on-disk size
    of every blob in `shas`; output order matches input order."""
    t = tempfile.TemporaryFile(mode="w+b")
    inp = ["{0}\n".format(sha).encode("utf-8") for sha in shas]
    t.writelines(inp)
    t.seek(0)
    out = tempfile.TemporaryFile(mode="w+b")
    subprocess.check_call(["git", "cat-file", "--batch-check=%(objectsize),%(objectsize:disk)"],
                          cwd=repoDir,
                          stdin=t,
                          stdout=out)
    out.seek(0)
    return [l.decode("utf-8").strip().split(",") for l in out.readlines()]
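
# Each output line has the form "<objectsize>,<objectsize:disk>",
# e.g. "1432,412" (numbers illustrative), one line per input sha.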

def classifyBlob(repoDir, blobSha):
    """Classify a blob as 'binary' or 'text': dump it via `git show`
    and run the `is_binary` heuristic on the dumped file."""
    fd, tempPath = tempfile.mkstemp()
    os.close(fd)
    try:
        with open(tempPath, "wb") as f:
            subprocess.check_call(["git", "show", blobSha], cwd=repoDir, stdout=f)
        return "binary" if is_binary(tempPath) else "text"
    finally:
        os.remove(tempPath)

if __name__ == "__main__":
    argparser = argparse.ArgumentParser(prog="GetBlobs", description="Get all blobs")
    argparser.add_argument("-r", '--repoDir', required=True, help="Git repository")
    argparser.add_argument(
        "-g",
        '--debug',
        required=False,
        action="store_true",
        help="Only process the range HEAD~5..HEAD",
    )
    argparser.add_argument(
        "-c",
        '--chunkSize',
        required=False,
        type=int,
        default=512,
        help="Approximate number of chunks to split the work into for parallel processing"
    )
    argparser.add_argument("-t", '--tmpDir', required=False, default=None, help="Temporary directory to use")
    argparser.add_argument("-a", '--auxDir', required=True, help="Auxiliary output directory.")
    argparser.add_argument("-o", '--outputFile', required=True, help="Output file.")
    args = argparser.parse_args()
    tempfile.tempdir = args.tmpDir
    pool = assertMultiProcessingPool(os.cpu_count())
    # Get all objects in the history.
    # `--filter=tree:1` keeps only root trees (no nested trees, no blobs);
    # `getBlobs` later recurses into them with `git ls-tree -r`.
    what = "--all" if not args.debug else "HEAD~5..HEAD"
    objects = get(
        "git rev-list --objects --full-history --filter=tree:1 {0} | ".format(what) +
        "git cat-file --batch-check='%(objecttype),%(objectname),%(rest)'",
        cwd=args.repoDir,
        shell=True,
        encoding="utf-8"
    )
    print("Got '{0}' objects from commits in history".format(len(objects)))
    # Keep only the `tree` objects
    trees = []
    for o in objects:
        t = o.strip().split(",")
        if len(t) >= 2 and t[0] == "tree":
            trees.append(t[1])
    print("Got '{0}' trees in history".format(len(trees)))
    nSize = max(1, int(len(trees) / args.chunkSize))
    cks = list(chunks(trees, nSize))
    results = list(tqdm(
        pool.imap_unordered(
            partial(getBlobs, args.repoDir),
            cks,
        ),
        total=len(cks),
    ))
    print("Accumulating '{0}' chunks ...".format(len(results)))
    allBlobs, allBlobSha = combine(results)
    print("Get size for '{0}' blobs ...".format(len(allBlobSha)))
    nSize = max(1, int(len(allBlobSha) / args.chunkSize))
    cks = list(chunks(allBlobSha, nSize))
    # Note: ordered `imap` (not `imap_unordered`) keeps the size results
    # aligned with `allBlobSha`.
    allSizes = list(tqdm(
        pool.imap(
            partial(getSizes, args.repoDir),
            cks,
        ),
        total=len(cks),
    ))
    allSizes = reduce(operator.iconcat, allSizes, [])
    assert len(allBlobSha) == len(allSizes), "Wrong number of sizes returned"
    for data, size in zip(allBlobs, allSizes):
        data["size"] = int(size[0])
        data["size:disk"] = int(size[1])
    if allBlobs and allBlobs[-1]["sha"] != allBlobSha[-1]:
        raise RuntimeError("Programming error")
    aux = os.path.join(args.auxDir, "allBlobs.json")
    print("Dumping all blobs to '{0}'".format(aux))
    jsonDump(allBlobs, aux)
    print("Classify '{0}' blobs into binary or text ...".format(len(allBlobs)))
    allTypes = list(
        tqdm(
            pool.imap(
                partial(classifyBlob, args.repoDir),
                allBlobSha,
                chunksize=64,
            ),
            total=len(allBlobSha),
        )
    )
    for data, fileType in zip(allBlobs, allTypes):
        data["fileType"] = fileType
    print("Saving all '{0}' blobs to '{1}' ...".format(len(allBlobs), args.outputFile))
    jsonDump(
        allBlobs,
        args.outputFile,
        indent=2,
    )
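
# Example invocation (script name and paths illustrative):
#   python getAllBlobs.py -r /path/to/repo -a /tmp/aux -o /tmp/allBlobs.json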