Created
August 23, 2013 19:49
-
-
Save almarklein/6323266 to your computer and use it in GitHub Desktop.
Some tools for analyzing the files (also deleted files) in the repo and removing them completely from history.
Warning: black magic is involved, and the commit hashes will be changed.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Functionality for cleaning up your git repository. | |
This uses commands found on: | |
https://help.github.com/articles/remove-sensitive-data | |
""" | |
import os | |
import sys | |
import subprocess | |
def call(cmd):
    """ Run *cmd* through the system shell and return its exit status.

    The command's stdout/stderr are not captured; they go straight to
    the terminal. The exit status is returned (0 normally means success)
    so that callers can detect a failed command if they want to; callers
    that ignore the return value behave exactly as before.
    """
    return subprocess.call(cmd, shell=True)
def remove(path):
    """ Remove the given path (file or directory) from the history
    of this repository.

    Every branch and tag is rewritten (``--all``), so the commit hashes
    change. Returns the exit status of ``git filter-branch``.
    """
    import shlex  # local import: only this function needs shell quoting

    # git evaluates the index filter with the shell for every commit, so
    # the path has to be quoted or names with spaces, quotes or shell
    # metacharacters would be split or executed. The "--" keeps a path
    # that starts with a dash from being read as an option of "git rm".
    index_filter = ("git rm -r --cached --ignore-unmatch -- " +
                    shlex.quote(path))
    # An argument list (no shell=True) hands the filter to git verbatim,
    # so it needs no second level of quoting.
    return subprocess.call([
        "git", "filter-branch", "--force",
        "--index-filter", index_filter,
        "--prune-empty", "--tag-name-filter", "cat",
        "--", "--all",
    ])
def clean():
    """ Clean the backup refs that were created during the remove step
    and prune the objects that are no longer referenced.
    """
    # filter-branch stores the pre-rewrite value of each rewritten ref
    # under refs/original/. remove() rewrites all refs (--all), so there
    # can be backups for more than just master; delete every one of them.
    call('git for-each-ref --format="delete %(refname)" refs/original/ '
         '| git update-ref --stdin')
    # Expire the reflog entries that still point at the old commits ...
    call("git reflog expire --expire=now --all")
    # ... so that gc can drop the now-unreferenced objects right away.
    call("git gc --prune=now")
def deleted():
    """ Print the files that were deleted at some point in the history
    reachable from the current HEAD.

    Each printed line is a ``delete mode <mode> <path>`` summary line
    from ``git log``.
    """
    # Anchor the pattern on the summary-line format: a plain "grep delete"
    # also matches commit messages and author lines containing "delete".
    subprocess.call(
        'git log --diff-filter=D --summary | grep "^ delete mode "',
        shell=True)
def all():
    """ Print a sorted list of all paths that were ever in the repo.

    The list is built from ``git rev-list --objects --all`` and holds
    every distinct path once (directories are listed as well as files).

    NOTE: this function shadows the builtin ``all`` inside this module;
    the name is kept because it is part of the command line interface.
    """
    # cut: -d' ' -f 2- keeps the whole path (also when it contains
    #      spaces); -s drops lines without a separator, which are the
    #      commit lines that carry only a hash and no path.
    # sort -u: order the paths and collapse duplicates.
    subprocess.call(
        "git rev-list --objects --all | cut -s -d' ' -f 2- | sort -u",
        shell=True)
def _git_lines(args):
    """ Run git with the given argument list and return its stdout as a
    list of text lines.

    The exit status is ignored (whatever git printed is still returned)
    and an empty list is returned when git cannot be started at all.
    """
    try:
        proc = subprocess.Popen(["git"] + list(args), stdout=subprocess.PIPE)
    except OSError:
        return []
    raw = proc.communicate()[0]
    return raw.decode("utf-8", "replace").splitlines()


def sizes(maxcount=20):
    """ Print the N biggest files in the repo (and show their size in bytes).

    One line ``<size> <path>`` is printed per blob, biggest first, at most
    *maxcount* lines (nothing for ``maxcount <= 0``). The size is the
    third column of ``git verify-pack -v``; only non-delta blob entries
    are considered. No scratch files are written to the working directory.
    """
    import glob  # local import: only this function needs it

    # Map object id -> path for everything reachable from any ref.
    # Lines without a path (commits, root trees) are skipped.
    paths = {}
    for line in _git_lines(["rev-list", "--objects", "--all"]):
        sha, _, path = line.partition(" ")
        if path:
            paths[sha] = path

    # Repack before inspecting the pack indexes; gc's own output still
    # goes to the terminal.
    subprocess.call(["git", "gc"])

    # Ask git where its directory is, so this also works from a
    # subdirectory of the work tree; fall back to ".git".
    git_dir = (_git_lines(["rev-parse", "--git-dir"]) or [".git"])[0]
    indexes = sorted(
        glob.glob(os.path.join(git_dir, "objects", "pack", "pack-*.idx")))

    # Collect "<sha> blob <size> <size-in-pack> <offset>" entries. Delta
    # entries have two extra columns and are left out. A dict keeps one
    # entry per object even if it is stored in several packs.
    blob_sizes = {}
    if indexes:
        for line in _git_lines(["verify-pack", "-v"] + indexes):
            fields = line.split()
            # The builtin all() is shadowed by this module's all(), so
            # the three numeric columns are checked one by one.
            if (len(fields) == 5 and fields[1] == "blob" and
                    fields[2].isdigit() and fields[3].isdigit() and
                    fields[4].isdigit()):
                blob_sizes.setdefault(fields[0], int(fields[2]))

    ranked = sorted(blob_sizes.items(), key=lambda item: item[1],
                    reverse=True)
    for sha, size in ranked[:max(maxcount, 0)]:
        path = paths.get(sha)
        # Packed objects that no ref reaches have no known path; only
        # their size is shown.
        print("%d %s" % (size, path) if path else str(size))
def main(argv):
    """ Dispatch a command line to the matching function of this module.

    *argv* is the full argument vector (program name first, as in
    ``sys.argv``). Supported commands:

        remove PATH | clean | deleted | all | sizes [N]

    Returns the process exit status: 0 when a command was run, 1 for an
    unknown command and 2 when arguments are missing or malformed.
    """
    usage = "Commands: remove PATH | clean | deleted | all | sizes [N]"
    args = list(argv[1:])
    if not args:
        # No command given: explain instead of failing with IndexError.
        print(usage, file=sys.stderr)
        return 2

    command = args[0]
    if command == 'remove':
        if len(args) < 2:
            print("remove needs a path. " + usage, file=sys.stderr)
            return 2
        remove(args[1])
    elif command == 'clean':
        clean()
    elif command == 'deleted':
        deleted()
    elif command == 'all':
        all()
    elif command == 'sizes':
        if len(args) > 1:
            try:
                maxcount = int(args[1])
            except ValueError:
                print("sizes needs an integer count, got %r" % args[1],
                      file=sys.stderr)
                return 2
            sizes(maxcount)
        else:
            sizes()
    else:
        print('Invalid command')
        return 1
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment