Created
January 24, 2019 16:30
-
-
Save MatthewWilkes/da2c43a407b3c3a8e6bff8a8cc0caa93 to your computer and use it in GitHub Desktop.
Extract deleted commits from a GitHub repo
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import os | |
import re | |
import subprocess | |
import tempfile | |
import requests | |
def get_repo(owner, repo): | |
"""Check the repo out to a temporary directory""" | |
tempdir = tempfile.mkdtemp() | |
url = "http://github.com/%s/%s.git" % (owner, repo) | |
subprocess.check_output(['git', 'clone', url, tempdir]) | |
return tempdir | |
def get_shas(checkout): | |
"""List all reachable commits from a checkout""" | |
os.chdir(checkout) | |
commits = subprocess.check_output(['git', 'repack']) | |
packdir = os.path.join(checkout, '.git', 'objects', 'pack') | |
packfile = [filename for filename in os.listdir(packdir) if '.idx' in filename] | |
packfile = os.path.join(packdir, packfile[0]) | |
shas = subprocess.check_output(['git', 'verify-pack', '-v', packfile]) | |
shas = {sha[:4] for sha in shas.split("\n") if 'object' not in sha and sha} | |
return shas | |
def potential_hashes(checkout, shas=None): | |
"""Get all 4 digit hashes that aren't in our known list""" | |
if shas is None: | |
shas = get_shas(checkout) | |
all_possibles = {'%04x' % possible for possible in xrange(16**4)} | |
candidates = all_possibles - shas | |
return candidates | |
def get_parents_for_commit(owner, repo, commit): | |
"""Get the parents from the API. We can't use git as it won't show us hidden | |
commits, but we have the full hash so we can use the API and avoid hitting | |
the frontend""" | |
url = "https://api.github.com/repos/%s/%s/commits/%s" % (owner, repo, commit) | |
response = requests.get(url).json() | |
for parent in response['parents']: | |
yield parent['sha'] | |
def get_all_ancestors(owner, repo, commit, break_on=None): | |
"""Given a hash, iterate over all of its ancestors, with an optional | |
set of commits we have seen already to ignore.""" | |
if break_on is None: | |
break_on = set() | |
# Recurse over the parents, so we can follow both sides of a merge | |
parents = get_parents_for_commit(owner, repo, commit) | |
for parent in parents: | |
# We've seen this commit before, stop processing | |
if parent[:4] not in break_on: | |
yield parent | |
for ancestor in get_all_ancestors(owner, repo, parent, break_on): | |
yield ancestor | |
def check_candidate(owner, repo, candidate, known_shas): | |
"""Given repo information and a 4 digit prefix, try and find that commit, | |
then iterate over its ancestors if it exists. Otherwise return an empty | |
iterator""" | |
url = 'https://github.com/%s/%s/commit/%s' % (owner, repo, candidate) | |
response = requests.get(url) | |
if response.status_code == 200: | |
# This is a real commit, find the full hash | |
full_hash = re.compile('/commit/(%s.*?)\"' % candidate).findall(response.text)[0] | |
# Make a note of the prefix, so we don't re-scan this tree | |
known_shas.add(full_hash[:4]) | |
yield full_hash | |
# Yield from all ancestors of this commit, until we hit a known commit | |
for ancestor in get_all_ancestors(owner, repo, full_hash, known_shas): | |
known_shas.add(ancestor[:4]) | |
yield ancestor | |
else: | |
return | |
def main(): | |
parser = argparse.ArgumentParser(description='Find unreachable commits') | |
parser.add_argument('owner', type=str, | |
help='the repository owner') | |
parser.add_argument('repo', type=str, | |
help='the repository name') | |
args = parser.parse_args() | |
# Get the git repo and find the candidates we will check for | |
repository = get_repo(args.owner, args.repo) | |
print "Extracting known identifiers" | |
known_shas = get_shas(repository) | |
potential_commits = potential_hashes(repository, shas=known_shas) | |
print "There are %d potential commits to check for" % (len(potential_commits)) | |
# Keep track of the last completion percentage, so we can report to the user during big scans | |
percentage_complete = 0 | |
for i, candidate in enumerate(potential_commits): | |
current_percentage = round(((i+1.0) / len(potential_commits)) * 100) | |
# Loop over ancestors for a 4 digit hash. If the hash doesn't exist, no ancestors are returned | |
for parent in check_candidate(args.owner, args.repo, candidate, known_shas): | |
print "Found hidden commit https://github.com/%s/%s/commit/%s" % (args.owner, args.repo, parent) | |
if current_percentage > percentage_complete: | |
print "%d%% complete" % (int(current_percentage)) | |
percentage_complete = current_percentage | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment