-
-
Save afternoon/1433794 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
#
# git-slim
#
# Remove big files from git repo history.
#
# Requires GitPython (https://github.com/gitpython-developers/GitPython)
#
# References:
# - http://help.github.com/remove-sensitive-data/
# - http://stackoverflow.com/questions/4444091/git-filter-branch-to-delete-large-file
# - http://stackoverflow.com/questions/1029969/why-is-my-git-repository-so-big/1036595#1036595
# - http://stackoverflow.com/questions/460331/git-finding-a-filename-from-a-sha1
from glob import glob | |
from git import Repo | |
from os.path import getsize | |
from re import split | |
from shutil import rmtree | |
from sys import argv, exit, stdout | |
def print_activity(start, end='done'):
    '''Decorator factory which logs info like "Doing something: done" to
    stdout.

    `start` is written (followed by ': ') before the decorated function runs;
    `end` is printed on the same line once it has returned. The decorated
    function's return value is passed through unchanged. If the function
    raises, `end` is not printed and the exception propagates.
    '''
    # Local import so the block does not depend on a file-level import.
    from functools import wraps

    def decorate(f):
        @wraps(f)  # keep the wrapped function's __name__ and docstring
        def wrapped(*args, **kwargs):
            stdout.write('%s: ' % start)
            # Flush so the label is visible while a slow step is running.
            stdout.flush()
            result = f(*args, **kwargs)
            # Single-argument print() is valid on both Python 2 and 3.
            print(end)
            return result
        return wrapped
    return decorate
def slim_main():
    '''Command-line entry point.

    Slims the repository named by the first command-line argument, or the
    current directory when no argument is given. A keyboard interrupt ends
    the program with exit status 0.
    '''
    if len(argv) > 1:
        target = argv[1]
    else:
        target = '.'
    try:
        slim(target)
    except KeyboardInterrupt:
        exit(0)
def slim(repo_dir):
    '''Run the whole slimming workflow on the repository at `repo_dir`.

    Steps, in order: prepare the repo, measure it, interactively remove
    blobs, tidy up, measure again and report the difference.
    '''
    repo = Repo(repo_dir)
    prep(repo)
    size_before = repo_size(repo)
    slim_blobs(repo)
    tidy_up(repo)
    ok_done(size_before, repo_size(repo))
def repo_size(r):
    '''Return the total size in bytes of all files below `r.git_dir`.

    getsize() on the directory itself only reports the size of the directory
    entry, not of its contents, so the tree is walked and the individual file
    sizes are summed. Files that cannot be stat'ed (for example because they
    vanish during the walk) are skipped.
    '''
    # Local imports so the block does not depend on new file-level imports.
    from os import walk
    from os.path import join

    total = 0
    for root, _dirs, files in walk(r.git_dir):
        for name in files:
            try:
                total += getsize(join(root, name))
            except OSError:
                continue
    return total
def prep(r):
    '''Prep a repo by running GC and repacking.

    Raises Exception('repo is dirty') if the repo reports itself as dirty;
    nothing is run in that case.
    '''
    dirty = r.is_dirty
    # Some GitPython versions expose is_dirty as a plain boolean attribute
    # rather than a method; calling it there raises
    # "TypeError: 'bool' object is not callable". Accept both forms.
    if callable(dirty):
        dirty = dirty()
    if dirty:
        raise Exception('repo is dirty')
    gc(r)
    repack(r)
def slim_blobs(r):
    '''Reduce repo size by listing blobs in size order and asking the user if
    they would like to remove them.

    Each distinct path is offered once. Answers: 'y' or empty input marks the
    path for removal, 'd' stops asking, anything else skips the path. All
    marked paths are then handed to remove_files() in one call.
    '''
    # raw_input only exists on Python 2; on Python 3 input() does the same.
    try:
        ask = raw_input
    except NameError:
        ask = input

    pack_blobs = list_pack_blobs_by_size(r)
    index = blob_index(r)
    seen = set()  # paths already offered to the user
    targets = []
    for hexsha, size in pack_blobs:
        if hexsha not in index:
            print('%s not in blob index' % hexsha)
            continue
        blob_path, commit_hexsha = index[hexsha]
        if blob_path in seen:
            continue
        seen.add(blob_path)
        prompt = 'Remove %s (%s at %s)? [Y/n/d] ' % (
            blob_path, format_size(size), commit_hexsha[:7])
        answer = ask(prompt).strip().lower()
        if answer == 'd':
            break
        if answer in ('y', ''):
            targets.append(blob_path)
    remove_files(r, targets)
def blob_index(r):
    '''Build index of paths of blobs in the repo. Iterates across all files in
    all commits returned by r.iter_commits() and records the blob used.

    Returns a dict mapping blob hexsha -> (blob path, str(commit)). When the
    same blob appears in several commits, the entry from the commit visited
    last wins. Progress is written to stdout while indexing.
    '''
    desc = 'Indexing blobs in commits: '
    index = {}
    commits = list(r.iter_commits())
    commits_len = len(commits)

    def is_blob(item, depth):
        return item.type == 'blob'

    progress = ''
    for i, commit in enumerate(commits, 1):
        progress = '(%s/%s)' % (i, commits_len)
        # '\r' rewinds to the start of the line so the counter updates in place.
        stdout.write('\r%s%s' % (desc, progress))
        stdout.flush()
        for blob in commit.tree.traverse(predicate=is_blob):
            index[blob.hexsha] = blob.path, str(commit)
    # Pad 'done' to the width of the last counter so it is fully overwritten.
    print('\r%s%s' % (desc, 'done'.ljust(len(progress))))
    return index
@print_activity('Listing pack blobs')
def list_pack_blobs_by_size(r):
    '''Return the (hexsha, size) pairs from list_pack_blobs() as a list,
    largest size first.
    '''
    def size_of(entry):
        return entry[1]

    return sorted(list_pack_blobs(r), key=size_of, reverse=True)
def list_pack_blobs(r):
    '''Call git verify-pack to dump info about blobs in a pack.

    Returns an iterator of (hexsha, size) pairs for the blobs found in every
    pack index under `r.git_dir`. If there are no pack index files the result
    is empty.
    '''
    pack_index_glob = r.git_dir + '/objects/pack/pack-*.idx'
    pack_index_files = glob(pack_index_glob)
    if not pack_index_files:
        # verify-pack needs at least one index file; invoking it with none
        # would fail instead of reporting "no blobs".
        return iter(())
    pack_info = r.git.verify_pack(*pack_index_files, verbose=True)
    return extract_blob_info(pack_info)
def extract_blob_info(pack_info):
    '''Extract info about blobs in a pack from text returned by git verify-pack.

    Yields (first field, int(fourth field)) for every line whose second
    whitespace-separated field is 'blob'. Lines with fewer than four fields
    are ignored.
    '''
    for line in pack_info.splitlines():
        # str.split() with no argument ignores leading/trailing whitespace,
        # so an indented line cannot shift the fields by one.
        bits = line.split()
        # Require four fields: bits[3] is read below.
        if len(bits) > 3 and bits[1] == 'blob':
            yield bits[0], int(bits[3])
def format_size(num):
    '''Format numbers as file sizes. From hurry.filesize.

    The value is divided by 1024 until it drops below 1024 and is then
    rendered without decimals plus a unit (' bytes', 'KB', 'MB', 'GB', 'TB').
    Values of 1024 TB or more are still reported in TB rather than falling
    off the end of the unit list and returning None. Negative values keep
    their sign and are scaled by magnitude.
    '''
    sign = '-' if num < 0 else ''
    num = abs(num)
    for unit in (' bytes', 'KB', 'MB', 'GB'):
        if num < 1024.0:
            return '%s%.0f%s' % (sign, num, unit)
        num /= 1024.0
    # Largest known unit: report whatever is left in TB.
    return '%s%.0f%s' % (sign, num, 'TB')
@print_activity('Removing files from repo history')
def remove_files(r, fs):
    '''Run git rm for each file in list against each commit using git
    filter-branch. Completely removes files from the history of HEAD.

    `fs` is a list of repository paths; nothing is run when it is empty.
    Only 'HEAD' is rewritten, so other branches are left untouched.
    '''
    if not fs:
        return
    # shlex.quote on Python 3, pipes.quote on Python 2.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote
    # todo: check file list doesn't exceed max command length
    # The index filter is evaluated by a shell, so every path is quoted to
    # survive spaces and shell metacharacters; '--' stops a path that starts
    # with '-' from being read as an option.
    filelist = ' '.join(quote(f) for f in fs)
    r.git.filter_branch('--index-filter',
                        'git rm --cached --ignore-unmatch -- %s' % filelist,
                        '--prune-empty',
                        'HEAD')
def tidy_up(r):
    '''Tidy up by removing the original refs, expiring the reflog, garbage
    collecting the repo and repacking. Should recover space used by objects
    removed during the slimming process.
    '''
    # Order matters: refs and reflog entries are dropped before collecting.
    for step in (rm_original_refs, expire_reflog, gc, repack):
        step(r)
@print_activity('Removing original refs')
def rm_original_refs(r):
    '''Delete the refs/original/ directory inside the git dir, ignoring any
    error (including the directory not existing).
    '''
    backup_refs = r.git_dir + '/refs/original/'
    rmtree(backup_refs, ignore_errors=True)
@print_activity('Expiring reflog')
def expire_reflog(r):
    '''Run "git reflog expire --expire=now --all" on the repo.'''
    reflog_args = ('expire', '--expire=now', '--all')
    r.git.reflog(*reflog_args)
@print_activity('Garbage collecting')
def gc(r):
    '''Run "git gc --prune=now" on the repo.

    A bare --prune keeps git's default grace period for unreachable objects,
    so blobs orphaned moments ago by the history rewrite would survive the
    collection. --prune=now removes them regardless of age. This applies to
    every call of this function.
    '''
    r.git.gc(prune='now')
@print_activity('Repacking')
def repack(r):
    '''Run git repack on the repo with the a, d and q flags enabled.'''
    flags = dict(a=True, d=True, q=True)
    r.git.repack(**flags)
def ok_done(old_size, new_size):
    '''Print a summary of the size reduction and the follow-up steps.

    `old_size` and `new_size` are sizes in bytes measured before and after
    slimming; they are rendered with format_size().
    '''
    delta = format_size(old_size - new_size)
    old_f = format_size(old_size)
    new_f = format_size(new_size)
    # Single-argument print() is valid on both Python 2 and 3.
    print('\nRepo slimmed by %s (reduced from %s to %s)' % (delta, old_f, new_f))
    # The git flag is spelled --aggressive.
    print('(Running \'git gc --aggressive --prune\' may reclaim further space)\n')
    print('Next run \'git push origin --all --force\'')
    print('Then re-clone all copies of the repo')
    print('Warning: If an old clone is used, big objects may reappear')
# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    slim_main()
This exploded for me:
Garbage collecting: done
Repacking: done
Listing pack blobs: done
Traceback (most recent call last):
File "/Users/benson/bin/git-slim.py", line 208, in <module>
slim_main()
File "/Users/benson/bin/git-slim.py", line 40, in slim_main
slim(repo_dir)
File "/Users/benson/bin/git-slim.py", line 49, in slim
slim_blobs(r)
File "/Users/benson/bin/git-slim.py", line 73, in slim_blobs
index = blob_index(r)
File "/Users/benson/bin/git-slim.py", line 102, in blob_index
commits = list(r.iter_commits())
File "/Library/Python/2.7/site-packages/git/repo/base.py", line 423, in iter_commits
rev = self.head.commit
File "/Library/Python/2.7/site-packages/git/refs/symbolic.py", line 168, in _get_commit
obj = self._get_object()
File "/Library/Python/2.7/site-packages/git/refs/symbolic.py", line 161, in _get_object
return Object.new_from_sha(self.repo, hex_to_bin(self.dereference_recursive(self.repo, self.path)))
File "/Library/Python/2.7/site-packages/git/objects/base.py", line 64, in new_from_sha
oinfo = repo.odb.info(sha1)
File "/Library/Python/2.7/site-packages/gitdb/db/base.py", line 256, in info
return self._db_query(sha).info(sha)
File "/Library/Python/2.7/site-packages/gitdb/db/loose.py", line 162, in info
m = self._map_loose_object(sha)
File "/Library/Python/2.7/site-packages/gitdb/db/loose.py", line 146, in _map_loose_object
raise BadObject(sha)
gitdb.exc.BadObject: BadObject: 59a968e4b7bf20039a9314c383a7bb5aa955b53c
Quick look at the code: Won’t it only run git filter-branch
on the current branch instead of the whole repo?
And since this is such a potentially destructive script, you might want to add a notice about that.
@bimargulies, this script failed for me as well, with the identical error. For the time being this script probably shouldn't be used, especially since as @Chronial points out it's potentially destructive.
python git-slim.py
Garbage collecting: done
Repacking: done
Listing pack blobs: done
Traceback (most recent call last):
File "git-slim.py", line 208, in <module>
slim_main()
File "git-slim.py", line 40, in slim_main
slim(repo_dir)
File "git-slim.py", line 49, in slim
slim_blobs(r)
File "git-slim.py", line 73, in slim_blobs
index = blob_index(r)
File "git-slim.py", line 102, in blob_index
commits = list(r.iter_commits())
File "/Library/Python/2.7/site-packages/GitPython-0.3.2.RC1-py2.7.egg/git/repo/base.py", line 424, in iter_commits
rev = self.head.commit
File "/Library/Python/2.7/site-packages/GitPython-0.3.2.RC1-py2.7.egg/git/refs/symbolic.py", line 168, in _get_commit
obj = self._get_object()
File "/Library/Python/2.7/site-packages/GitPython-0.3.2.RC1-py2.7.egg/git/refs/symbolic.py", line 161, in _get_object
return Object.new_from_sha(self.repo, hex_to_bin(self.dereference_recursive(self.repo, self.path)))
File "/Library/Python/2.7/site-packages/GitPython-0.3.2.RC1-py2.7.egg/git/objects/base.py", line 64, in new_from_sha
oinfo = repo.odb.info(sha1)
File "/Library/Python/2.7/site-packages/gitdb-0.5.4-py2.7-macosx-10.9-intel.egg/gitdb/db/base.py", line 256, in info
return self._db_query(sha).info(sha)
File "/Library/Python/2.7/site-packages/gitdb-0.5.4-py2.7-macosx-10.9-intel.egg/gitdb/db/loose.py", line 162, in info
m = self._map_loose_object(sha)
File "/Library/Python/2.7/site-packages/gitdb-0.5.4-py2.7-macosx-10.9-intel.egg/gitdb/db/loose.py", line 146, in _map_loose_object
raise BadObject(sha)
gitdb.exc.BadObject: BadObject: 45bb1fbbb91af857c8566fd30fe59d6dfee0d63d
Failed on OS X 10.10
python git-slim.py
Traceback (most recent call last):
File "git-slim.py", line 208, in <module>
slim_main()
File "git-slim.py", line 40, in slim_main
slim(repo_dir)
File "git-slim.py", line 47, in slim
prep(r)
File "git-slim.py", line 61, in prep
if r.is_dirty():
TypeError: 'bool' object is not callable
I get this error :
File "git-slim.py", line 16, in <module>
from git import Repo
ImportError: No module named git
This means the library is not found. Where can I get the library for this?
@greenspray, you can google how to install gitpython.
I got the following error:
File "git-slim.py", line 30
print end
^
SyntaxError: Missing parentheses in call to 'print'. Did you mean print(end)?
I got the following error:
File "git-slim.py", line 30
print end
^
SyntaxError: Missing parentheses in call to 'print'. Did you mean print(end)?
I got the same error, I'm using Python 3.6.3
Worked perfectly
If your system has both python 3 and python 2 then this script is python2 friendly so ... python2 ./git-slim.py
Thanks for the little script. It made this timely process somewhat easier. Thanks again.
Dustin