Skip to content

Instantly share code, notes, and snippets.

@hughdbrown
Created January 17, 2011 21:23
Show Gist options
  • Save hughdbrown/783517 to your computer and use it in GitHub Desktop.
Save hughdbrown/783517 to your computer and use it in GitHub Desktop.
Idea for a git-like backup program
"""
Python script to back up data from src to dst: each file is copied into a
backing directory under its sha1 hash, and dst receives symlinks to those copies.
Hugh Brown
[email protected]
"""
from hashlib import sha1
import os
import os.path
from shutil import copyfile
from subprocess import Popen
from sys import exit
# Number of leading characters of a file's sha1 hex digest that name the
# subdirectory of the backing store in which the file's copy is kept.
SHA_LEN = 4 # Length of the SHA1 hash to use as directory prefix
def calc_sha(filename, size=10*1000*1000):
    """
    Return the hexadecimal sha1 digest of the contents of `filename`.

    The file is opened in binary mode and fed to the hash in pieces, with
    each read asking for at most `size` bytes.
    """
    digest = sha1()
    with open(filename, "rb") as stream:
        chunk = stream.read(size)
        # An empty read marks end of file.
        while chunk:
            digest.update(chunk)
            chunk = stream.read(size)
    return digest.hexdigest()
def create_backed_file(filename, backing_root, sha):
    """
    Copy a file to the calculated backing store location.

    The copy lives at <backing_root>/<first SHA_LEN chars of sha>/<sha>.
    Since that name is derived from the content hash, a file already present
    under it is taken to hold the same content and is not copied again.

    filename     -- path of the file to store
    backing_root -- root directory of the backing store
    sha          -- hex digest of the file's contents

    Returns the path of the stored copy. Raises OSError/IOError if the
    directory cannot be created or the copy fails.
    """
    dstdir = os.path.join(backing_root, sha[:SHA_LEN])
    try:
        os.makedirs(dstdir)
    except OSError:
        # The directory already existing (perhaps created concurrently) is
        # fine; any other failure is real and must propagate.
        if not os.path.isdir(dstdir):
            raise
    dstname = os.path.join(dstdir, sha)
    if not os.path.exists(dstname):
        # Copy under a temporary name and rename into place, so an
        # interrupted copy never leaves a truncated file under the hash name
        # (which later runs would otherwise trust and skip).
        tmpname = dstname + ".tmp"
        copyfile(filename, tmpname)
        os.rename(tmpname, dstname)
    return dstname
def calc_dstname(dst_root, filename):
    """
    Return the path under `dst_root` that corresponds to `filename`.

    Relative filenames are simply joined onto `dst_root`. Absolute filenames
    are first made relative by dropping any drive prefix and leading path
    separators, so "/etc/hosts" maps to "<dst_root>/etc/hosts".
    """
    # os.path.join() throws away dst_root when the second part is absolute,
    # which would place the result on top of the source file itself.
    separators = os.sep + (os.altsep or "")
    relative = os.path.splitdrive(filename)[1].lstrip(separators)
    return os.path.join(dst_root, relative)
def link_dstfile_to_shafile(dstfile, shafile):
    """
    Make `dstfile` a symbolic link to `shafile`.

    The directory that will hold the link is created if it is missing, and an
    existing file or link at `dstfile` is replaced. The link always points at
    the absolute path of `shafile`.

    Raises OSError if the directory or the link cannot be created.
    """
    dstdir = os.path.split(dstfile)[0]
    # dstdir is empty when dstfile has no directory part; nothing to create.
    if dstdir and not os.path.exists(dstdir):
        print("Create dir %s" % dstdir)
        os.makedirs(dstdir)
    # A relative symlink target is resolved relative to the link's own
    # directory, not the current directory, so store an absolute target.
    target = os.path.abspath(shafile)
    # lexists() is also true for a dangling link, which must be replaced too.
    if os.path.lexists(dstfile):
        os.remove(dstfile)
    # Link in-process rather than spawning an unwaited "ln -sf", so failures
    # surface as exceptions and no child processes are left behind.
    os.symlink(target, dstfile)
def sha_file_system(src_iter, dst_root, backing_root):
    """
    Back up every regular file named by `src_iter`.

    Each file is hashed, copied into the backing store under `backing_root`,
    and linked from its corresponding location under `dst_root`. Entries of
    `src_iter` that are not regular files are ignored.
    """
    for path in src_iter:
        if not os.path.isfile(path):
            continue
        stored = create_backed_file(path, backing_root, calc_sha(path))
        link_dstfile_to_shafile(calc_dstname(dst_root, path), stored)
def walker(dir):
    """
    Generate the path of every file found by os.walk() below `dir`.

    Each yielded path is the walked directory joined with the file name.
    """
    for dirpath, _subdirs, names in os.walk(dir):
        for path in (os.path.join(dirpath, name) for name in names):
            yield path
def get_parser():
    """
    Build and return the optparse parser for the command line.

    Options: -r/--recursive (flag), -s/--source and -d/--destination
    (both default to "."), and -b/--backing (no default).
    """
    from optparse import OptionParser
    # One (flags, keyword settings) pair per command-line option.
    option_specs = (
        (("-r", "--recursive"),
         dict(action="store_true", default=False, dest="recursive",
              help="Recurse directories")),
        (("-s", "--source"),
         dict(dest="source", default=".", help="Source location")),
        (("-d", "--destination"),
         dict(dest="destination", default=".", help="Destination location")),
        (("-b", "--backing"),
         dict(dest="backing", help="Root for backing store")),
    )
    parser = OptionParser()
    for flags, settings in option_specs:
        parser.add_option(*flags, **settings)
    return parser
def get_options(parser):
    """
    Parse the command line and work out what to back up and where.

    Returns a tuple (fileiter, destination, backing):
      fileiter    -- iterable of source file names: every file below the
                     --source directory when --recursive is given, otherwise
                     the glob expansion of the positional arguments
      destination -- the --destination value with "~" expanded
      backing     -- the --backing value with "~" expanded; the directory is
                     created if it does not exist

    Prints a message and exits with status 1 if --backing is missing or
    names something that exists but is not a directory.
    """
    from glob import glob
    options, args = parser.parse_args()
    if not options.backing:
        print("Require backing_root argument")
        exit(1)
    backing = os.path.expanduser(options.backing)
    if not os.path.exists(backing):
        print("Create dir %s" % backing)
        os.makedirs(backing)
    elif not os.path.isdir(backing):
        # Explicit check instead of an assert: asserts vanish under -O and
        # give the user no explanation.
        print("Backing root %s is not a directory" % backing)
        exit(1)
    destination = os.path.expanduser(options.destination)
    if options.recursive:
        # NOTE(review): unlike backing and destination, the source is used
        # exactly as given (no "~" expansion) -- presumably to keep it a
        # relative path; confirm before changing.
        fileiter = walker(options.source)
    else:
        fileiter = (filename for arg in args for filename in glob(arg))
    return fileiter, destination, backing
if __name__ == '__main__':
    # Script entry point: parse the command line, back up the selected
    # files, then exit with a success status.
    sha_file_system(*get_options(get_parser()))
    exit(0)
@brianegge
Copy link

I have the same problem with photos. I have a one-liner I use to find exact duplicates:

find . -type f -print0 | xargs -0 -n1 md5sum | sort -k 1,32 | guniq -w 32 -d --all-repeated=separate | sed -e 's/^[0-9a-f]*\ *//;'

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment