Created
September 27, 2015 16:49
-
-
Save jseabold/d325da12a7718d1c74a3 to your computer and use it in GitHub Desktop.
git filter-branch magic using Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
import glob
import os
import re
import shlex
import shutil
import subprocess
from collections import namedtuple
from subprocess import PIPE
# One packed object: its sizes in KB, its SHA, and the path it maps to.
Row = namedtuple('row', ['size_kb', 'pack_kb', 'sha', 'location'])


def split_row(x, obj_file_mapper):
    """Parse one object line of ``git verify-pack -v`` output into a ``Row``.

    Parameters
    ----------
    x : str
        A whitespace-separated line whose first field is the object SHA,
        third field the object size and fourth field the packed size.
    obj_file_mapper : dict
        Maps an object SHA to its location (path) in the repository.

    Returns
    -------
    Row
        ``(size_kb, pack_kb, sha, location)`` with both sizes divided
        by 1024.

    Raises
    ------
    KeyError
        If the SHA is not present in ``obj_file_mapper``.
    """
    # str.split() splits on runs of whitespace. The previous
    # re.split(' *', x) can match the empty string, which on Python 3.7+
    # splits between every character and scrambles the field indices.
    fields = x.split()
    sha = fields[0]
    # fields[1] is the object type and fields[4] the offset; neither is
    # needed here, and any further fields are ignored.
    size = int(fields[2]) / 1024
    pack = int(fields[3]) / 1024
    location = obj_file_mapper[sha]
    return Row(size, pack, sha, location)
def get_obj_location_mapping():
    """Map object SHAs to the location reported by ``git rev-list``.

    Runs ``git rev-list --all --objects`` and keeps only the output lines
    that carry something after the SHA (potentially a file path).

    Returns
    -------
    dict
        ``{sha: location}``. Lines consisting of a SHA alone are skipped.
        If a SHA appears on several lines, the last one wins.
    """
    p = subprocess.Popen(['git', 'rev-list', '--all', '--objects'],
                         stdout=PIPE)
    raw = p.communicate()[0]
    # Paths are not guaranteed to be ASCII. Decode as UTF-8 and let
    # undecodable bytes round-trip via surrogateescape instead of raising
    # UnicodeDecodeError on the first non-ASCII file name.
    text = raw.decode('utf-8', errors='surrogateescape')

    obj_map = {}
    for line in text.split('\n'):
        # Keep only "<sha> <location>" lines; a bare SHA (or "<sha> " with
        # an empty name) has no space left once stripped.
        if ' ' not in line.strip():
            continue
        sha, location = line.split(' ', 1)
        obj_map[sha] = location
    return obj_map
def get_largest_n(n=25, sortby='size'):
    """Return the ``n`` largest packed objects that map to a location.

    Parameters
    ----------
    n : int
        Maximum number of rows to return.
    sortby : {'size', 'packed'}
        Sort on the object size or on the size inside the pack file.

    Returns
    -------
    list of Row
        Largest first. Empty when there are no pack index files.

    Raises
    ------
    KeyError
        If ``sortby`` is neither ``'size'`` nor ``'packed'``.
    """
    # 0-based index into the whitespace-split verify-pack line.
    field = {'size': 2, 'packed': 3}[sortby]

    # Expand the glob in Python rather than relying on a shell.
    idx_files = sorted(glob.glob('.git/objects/pack/pack-*.idx'))
    if not idx_files:
        return []

    p = subprocess.Popen(['git', 'verify-pack', '-v'] + idx_files,
                         stdout=PIPE)
    output = p.communicate()[0].decode('ascii', errors='replace')

    obj_file_mapper = get_obj_location_mapping()

    candidates = []
    for line in output.split('\n'):
        fields = line.split()
        # Object lines have at least five fields with numeric sizes in
        # positions 2 and 3. This rejects the "chain length", "non delta"
        # and "<pack>: ok" summary lines, which previously reached
        # split_row whenever the repository held fewer than n objects.
        if len(fields) < 5:
            continue
        if not (fields[2].isdigit() and fields[3].isdigit()):
            continue
        # Objects without a known location cannot be filtered by path,
        # and looking them up would raise KeyError in split_row.
        if fields[0] not in obj_file_mapper:
            continue
        candidates.append((int(fields[field]), line))

    # Stable sort: equal sizes keep their verify-pack order.
    candidates.sort(key=lambda item: item[0], reverse=True)
    return [split_row(line, obj_file_mapper) for _, line in candidates[:n]]
def filter_branch(loc):
    """Remove the object at ``loc`` from all commits and tags.

    Runs ``git filter-branch -f`` over ``--all`` with an index filter that
    drops ``loc`` from the index, and appends git's stdout and stderr to
    ``output.tmp`` in the current directory.

    Parameters
    ----------
    loc : str
        Path of the object to remove, as reported by ``git rev-list``.

    Returns
    -------
    subprocess.Popen
        The finished process; inspect ``returncode`` for the exit status.
    """
    # filter-branch evaluates the index filter with a shell, so the path
    # must be quoted for that shell. shlex.quote handles spaces, quotes
    # and metacharacters that the previous hand-written '...' did not.
    # "--" stops a path starting with "-" being read as an option.
    index_filter = 'git rm --cached --ignore-unmatch -- {}'.format(
        shlex.quote(loc))
    cmd = ['git', 'filter-branch', '-f',
           '--index-filter', index_filter,
           '--tag-name-filter', 'cat', '--',
           '--all']
    with open("output.tmp", "a") as fout:
        # Write to a file, not a pipe: a pipe would deadlock once its
        # buffer fills up because nothing reads it.
        p = subprocess.Popen(cmd, stdout=fout, stderr=fout)
        p.wait()
    return p
def print_objects(objects):
    """Print a table of objects: SHA, size, packed size and location.

    Parameters
    ----------
    objects : iterable
        Items exposing ``sha``, ``size_kb``, ``pack_kb`` and ``location``
        attributes (for example ``Row`` tuples).
    """
    row_fmt = "{sha} {size_kb:>8.0f} {pack_kb:>8.0f} {loc}"
    # Build the header with the same column widths as the rows (40-char
    # SHA, two 8-wide numeric columns) so the titles line up.
    header_fmt = "{:>40} {:>8} {:>8} {}"
    print(header_fmt.format("sha", "size_kb", "pack_kb", "location"))
    for row in objects:
        # size_kb is truncated to an int before formatting; pack_kb is
        # rounded by the format spec.
        print(row_fmt.format(sha=row.sha, size_kb=int(row.size_kb),
                             pack_kb=row.pack_kb, loc=row.location))
def get_size():
    """Return the ``size-pack`` value of ``git count-objects -v`` / 1024.

    Returns
    -------
    float
        The reported pack size divided by 1024 (printed by the caller as
        MB).

    Raises
    ------
    RuntimeError
        If the output contains no ``size-pack:`` line, for example when
        the command is not run inside a git repository.
    """
    p = subprocess.Popen(["git", "count-objects", "-v"], stdout=PIPE)
    output = p.communicate()[0].decode('ascii', errors='replace')
    # Raw string: "\d" in a normal string literal is an invalid escape.
    match = re.search(r"(?<=size-pack: )\d+", output)
    if match is None:
        raise RuntimeError(
            "could not find 'size-pack' in `git count-objects -v` output")
    return int(match.group()) / 1024
def ignore_missing(func, path, exc_info):
    """``shutil.rmtree`` error handler that tolerates missing paths.

    Parameters
    ----------
    func : callable
        The function that raised (unused).
    path : str
        The path passed to ``func`` (unused).
    exc_info : tuple
        ``(type, value, traceback)`` describing the error.

    Raises
    ------
    BaseException
        Re-raises the original exception unless it is a
        ``FileNotFoundError``.
    """
    exc_type, exc_value, exc_tb = exc_info
    if issubclass(exc_type, FileNotFoundError):
        return
    # Raise the exception instance. Raising the exc_info tuple itself is
    # a TypeError in Python 3 and would mask the real error.
    raise exc_value.with_traceback(exc_tb)
def git_ferocious():
    """Aggressively garbage collect the repository in the current directory.

    Removes the ``origin`` remote (ignoring failure), deletes
    ``refs/remotes/``, ``refs/original/``, every ``*_HEAD`` file and
    ``logs/`` inside ``.git``, then runs ``git gc`` with all reflog,
    rerere and prune expiry settings set to expire immediately.

    The working directory is changed to ``.git`` for the duration and
    restored afterwards, including on error.

    Raises
    ------
    OSError
        If ``.git`` cannot be entered or a file cannot be removed for a
        reason other than it being missing.
    """
    # Best effort: the exit status is deliberately ignored, matching the
    # former `git remote rm origin || true`. Output goes to DEVNULL rather
    # than an unread PIPE, which could block the child.
    subprocess.call(["git", "remote", "rm", "origin"],
                    stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL)

    # Remember where we started and chdir *before* the try block, so a
    # failed chdir does not trigger a restore that moves us somewhere new.
    start_dir = os.getcwd()
    os.chdir(".git")
    try:
        shutil.rmtree("refs/remotes/", onerror=ignore_missing)
        shutil.rmtree("refs/original/", onerror=ignore_missing)
        for stale_head in glob.glob("*_HEAD"):
            os.unlink(stale_head)
        shutil.rmtree("logs/", onerror=ignore_missing)
        gc_cmd = ["git",
                  "-c", "gc.reflogExpire=0",
                  "-c", "gc.reflogExpireUnreachable=0",
                  "-c", "gc.rerereresolved=0",
                  "-c", "gc.rerereunresolved=0",
                  "-c", "gc.pruneExpire=now",
                  "gc"]
        subprocess.call(gc_cmd, stdin=subprocess.DEVNULL,
                        stdout=subprocess.DEVNULL)
    finally:
        os.chdir(start_dir)
def remove_backup():
    """Delete every ref under ``refs/original``.

    Lists the refs with ``git for-each-ref`` and deletes each one with
    ``git update-ref -d``, using a shell pipeline.

    Returns
    -------
    subprocess.Popen
        The finished shell process.
    """
    cmd = """
git for-each-ref --format='%(refname)' refs/original | \
while read ref
do
    git update-ref -d "$ref"
done
"""
    p = subprocess.Popen(cmd, shell=True, stdout=PIPE, stdin=PIPE)
    # communicate() closes stdin and drains stdout while waiting. A bare
    # wait() with PIPE can deadlock if the child fills the pipe buffer.
    p.communicate()
    return p
def main():
    """Interactively strip the largest objects from the repository history.

    Repacks, lists the 25 largest objects, asks which locations to remove,
    runs ``filter_branch`` for each chosen location, then deletes the
    backup refs, expires the reflog, repacks and garbage collects,
    printing the pack size before and after.
    """
    print("The size of the repository before repacking "
          "is {:5.2f} MB".format(get_size()))

    # go ahead and pack everything up first
    subprocess.call(['git', 'repack', '-ad'])

    # this filtering strategy is dumb unless you know
    # you've got a lot of cruft
    # be smarter about what's in here if you want.
    # filter on types, use a diff-tree, or list them by hand
    objects = get_largest_n(25)
    print_objects(objects)

    to_filter = []
    asked = set()
    for obj in objects:
        # Several objects can share one location; ask about each location
        # once so filter-branch is not re-run for the same path.
        if obj.location in asked:
            continue
        asked.add(obj.location)
        answer = input('Remove {} (y/n) [y] ? '.format(obj.location))
        # An empty answer takes the default, which is yes.
        if answer.strip().lower() in ('', 'y', 'yes'):
            to_filter.append(obj.location)

    # Only a pause for the user; the reply itself is not used.
    input("About to run filter-branch. Last chance to backup.")

    print("The original size of the repository is {:5.2f} MB".format(
        get_size()))

    for loc in to_filter:
        print("Filtering {}".format(loc))
        filter_branch(loc)

    # clear up the refs/original backups
    remove_backup()

    # clean the reflog; output is discarded rather than left in an
    # unread pipe
    subprocess.call(["git", "reflog", "expire", "--expire=0", "--all"],
                    stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL)

    # repack
    subprocess.call(["git", "repack", "-ad"],
                    stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL)

    # aggressively garbage collect
    git_ferocious()

    print("The new size of the repository is {:5.2f} MB".format(get_size()))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment