@hughdbrown
Created January 17, 2011 21:23
Idea for a git-like backup program
"""
Python script to backup data in src to dst using sha1 hashes of the files
in a backing directory.
Hugh Brown
[email protected]
"""
from hashlib import sha1
import os
import os.path
from shutil import copyfile
from subprocess import Popen
from sys import exit
SHA_LEN = 4 # Length of the SHA1 hash to use as directory prefix
def calc_sha(filename, size=10*1000*1000):
"""
Calculate the sha1 hash of a file. Read in blocks.
"""
sh = sha1()
with open(filename, "rb") as f:
while True:
s = f.read(size)
if not s: break
sh.update(s)
return sh.hexdigest()
def create_backed_file(filename, backing_root, sha):
"""
Copy a file to the calculated backing store location.
"""
dstdir = os.path.join(backing_root, sha[:SHA_LEN])
if not os.path.exists(dstdir):
os.makedirs(dstdir)
dstname = os.path.join(dstdir, sha)
#print "Copy %s to %s" % (filename, dstname)
copyfile(filename, dstname)
return dstname
def calc_dstname(dst_root, filename):
# FIXME: this assumes relative paths on filename
return os.path.join(dst_root, filename)
def link_dstfile_to_shafile(dstfile, shafile):
dstdir = os.path.split(dstfile)[0]
if not os.path.exists(dstdir):
print "Create dir %s" % dstdir
os.makedirs(dstdir)
args = ["ln", "-sf", shafile, dstfile]
#print " ".join(args)
Popen(args)
def sha_file_system(src_iter, dst_root, backing_root):
for filename in src_iter:
if os.path.isfile(filename):
sha = calc_sha(filename)
backed_file = create_backed_file(filename, backing_root, sha)
dstfile = calc_dstname(dst_root, filename)
link_dstfile_to_shafile(dstfile, backed_file)
def walker(dir):
for root, _, files in os.walk(dir):
for filename in files:
yield os.path.join(root, filename)
def get_parser():
from optparse import OptionParser
parser = OptionParser()
parser.add_option("-r", "--recursive", action="store_true", default=False, dest="recursive", help="Recurse directories")
parser.add_option("-s", "--source", dest="source", default=".", help="Source location")
parser.add_option("-d", "--destination", dest="destination", default=".", help="Destination location")
parser.add_option("-b", "--backing", dest="backing", help="Root for backing store")
return parser
def get_options(parser):
from glob import glob
options, args = parser.parse_args()
if not options.backing:
print "Require backing_root argument"
exit(1)
else:
backing = os.path.expanduser(options.backing)
if not os.path.exists(backing):
print "Create dir %s" % backing
os.makedirs(backing)
else:
assert(os.path.isdir(backing))
destination = os.path.expanduser(options.destination)
if options.recursive:
#source = os.path.expanduser(options.source)
source = options.source
fileiter = walker(source)
else:
fileiter = (filename for arg in args for filename in glob(arg))
return fileiter, destination, backing
if __name__ == '__main__':
fileiter, destination, backing = get_options(get_parser())
sha_file_system(fileiter, destination, backing)
exit(0)
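
For reference, an invocation consistent with the options above might look like this (the script filename is hypothetical, since the gist doesn't show one):

    python backup.py --recursive --source ~/photos --destination ~/mirror --backing ~/.backing

With --recursive the script walks --source; without it, the remaining positional arguments are globbed and backed up as individual files.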
@hughdbrown (Author)
I have lots of digital photos and music files and videos that I'd love to copy over to my server and have all the duplicates folded. This is an outline of an idea for doing this.

It's not quite production-ready. It does not take enough care to ensure that the source file does not get overwritten with a link to the sha file.
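
A minimal sketch of the guard that caveat calls for, assuming link_dstfile_to_shafile is the place to hook it (safe_to_link is a hypothetical helper, not in the gist):

    import os

    def safe_to_link(dstfile):
        # Hypothetical guard: only let "ln -sf" proceed when nothing is
        # at dstfile yet, or when what is there is already a symlink; a
        # regular file at that path is presumed to be real source data.
        return (not os.path.lexists(dstfile)) or os.path.islink(dstfile)

Calling this at the top of link_dstfile_to_shafile, and skipping the ln when it returns False, would avoid replacing a source file with a link to its own backing copy.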

@brianegge
I have the same problem with photos. I have a one-liner I use to find exact duplicates:

find . -type f -print0 | xargs -0 -n1 md5sum | sort -k 1,32 | guniq -w 32 -d --all-repeated=separate | sed -e 's/^[0-9a-f]*\ *//;'
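
(guniq here is presumably GNU uniq installed under a g prefix.) A rough Python equivalent of that pipeline, as a sketch that uses sha1 to match the gist rather than md5:

    import os
    from collections import defaultdict
    from hashlib import sha1

    def find_duplicates(top):
        # Group every file under 'top' by its content hash and return
        # the groups with more than one member, i.e. exact duplicates.
        groups = defaultdict(list)
        for root, _, files in os.walk(top):
            for name in files:
                path = os.path.join(root, name)
                h = sha1()
                with open(path, "rb") as f:
                    for block in iter(lambda: f.read(1 << 20), b""):
                        h.update(block)
                groups[h.hexdigest()].append(path)
        return [paths for paths in groups.values() if len(paths) > 1]

Each returned group corresponds to one separator-delimited block in the one-liner's output.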
