Created
January 17, 2011 21:23
-
-
Save hughdbrown/783517 to your computer and use it in GitHub Desktop.
Idea for a git-like backup program
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Python script to backup data in src to dst using sha1 hashes of the files | |
in a backing directory. | |
Hugh Brown | |
[email protected] | |
""" | |
from hashlib import sha1 | |
import os | |
import os.path | |
from shutil import copyfile | |
from subprocess import Popen | |
from sys import exit | |
# Number of leading characters of a file's SHA1 hex digest that are used as
# the name of its subdirectory under the backing root.
SHA_LEN = 4 # Length of the SHA1 hash to use as directory prefix
def calc_sha(filename, size=10*1000*1000):
    """
    Return the SHA1 hex digest of the contents of `filename`.

    The file is opened in binary mode and fed to the hash in successive
    reads of `size` bytes (10 million by default) until a read comes back
    empty.
    """
    digest = sha1()
    with open(filename, "rb") as stream:
        block = stream.read(size)
        while block:
            digest.update(block)
            block = stream.read(size)
    return digest.hexdigest()
def create_backed_file(filename, backing_root, sha):
    """
    Store a copy of a file in the backing store and return its path there.

    The stored copy lives at <backing_root>/<first SHA_LEN chars of sha>/<sha>.
    The subdirectory is created if it is missing.

    filename     -- path of the file to store
    backing_root -- root directory of the backing store
    sha          -- hex digest used to name the stored copy
    Returns the path of the stored copy.
    """
    dstdir = os.path.join(backing_root, sha[:SHA_LEN])
    if not os.path.isdir(dstdir):
        os.makedirs(dstdir)
    dstname = os.path.join(dstdir, sha)
    # The store is addressed by content hash, so an existing entry already
    # holds this content: skip the copy instead of rewriting it on every run.
    if not os.path.exists(dstname):
        # Copy under a temporary name and rename into place, so an
        # interrupted copy never leaves a truncated file under the final
        # name (which the existence check above would then trust forever).
        tmpname = "%s.%d.tmp" % (dstname, os.getpid())
        copyfile(filename, tmpname)
        os.rename(tmpname, dstname)
    return dstname
def calc_dstname(dst_root, filename):
    """
    Return the path under `dst_root` that mirrors `filename`.

    Relative filenames are simply joined onto `dst_root`. Absolute filenames
    have their drive and leading separators removed first; without that,
    os.path.join() would discard `dst_root` and return the source path
    itself.
    """
    separators = os.sep + (os.altsep or "")
    relative = os.path.splitdrive(filename)[1].lstrip(separators)
    return os.path.join(dst_root, relative)
def link_dstfile_to_shafile(dstfile, shafile):
    """
    Make `dstfile` a symbolic link to `shafile`.

    The directory that will contain the link is created if necessary, and
    anything already present at `dstfile` is replaced.

    dstfile -- path at which to create the link
    shafile -- path of the stored file the link should point to
    Raises OSError if the link (or its directory) cannot be created, or if
    `dstfile` is an existing directory.
    """
    dstdir = os.path.dirname(dstfile)
    # dstdir is empty when dstfile is a bare name in the current directory;
    # os.makedirs("") would raise, and there is nothing to create anyway.
    if dstdir and not os.path.isdir(dstdir):
        print("Create dir %s" % dstdir)
        os.makedirs(dstdir)
    # A symlink's target is resolved relative to the link's own directory,
    # not the current working directory, so a relative shafile would produce
    # a dangling link. Store an absolute target instead.
    target = os.path.abspath(shafile)
    # lexists() is also true for a dangling symlink, which must be removed
    # before os.symlink() can succeed.
    if os.path.lexists(dstfile):
        os.remove(dstfile)
    # Create the link in-process so failures surface as exceptions, rather
    # than spawning an `ln` child whose exit status is never collected.
    os.symlink(target, dstfile)
def sha_file_system(src_iter, dst_root, backing_root):
    """
    Back up every regular file produced by `src_iter`.

    For each such file: hash its contents, copy it into the backing store
    under that hash, then link the corresponding name under `dst_root` to
    the stored copy. Entries that are not regular files are ignored.
    """
    for path in src_iter:
        if not os.path.isfile(path):
            continue
        stored = create_backed_file(path, backing_root, calc_sha(path))
        link_dstfile_to_shafile(calc_dstname(dst_root, path), stored)
def walker(dir):
    """
    Generate the path of every file found by os.walk() beneath `dir`.

    Each yielded path is the walked directory joined with the file name.
    """
    for dirpath, _subdirs, basenames in os.walk(dir):
        for full_path in [os.path.join(dirpath, name) for name in basenames]:
            yield full_path
def get_parser():
    """
    Build the command-line parser for the backup script.

    Options: -r/--recursive (flag, off by default), -s/--source and
    -d/--destination (both default to "."), and -b/--backing (no default).
    """
    from optparse import OptionParser

    # One (flags, keyword settings) entry per option, in help-output order.
    option_specs = (
        (("-r", "--recursive"),
         dict(action="store_true", default=False, dest="recursive",
              help="Recurse directories")),
        (("-s", "--source"),
         dict(dest="source", default=".", help="Source location")),
        (("-d", "--destination"),
         dict(dest="destination", default=".", help="Destination location")),
        (("-b", "--backing"),
         dict(dest="backing", help="Root for backing store")),
    )
    cli = OptionParser()
    for flags, settings in option_specs:
        cli.add_option(*flags, **settings)
    return cli
def get_options(parser):
    """
    Parse the command line and return (fileiter, destination, backing).

    parser -- option parser providing the `backing`, `destination`,
              `recursive` and `source` options

    fileiter    -- iterator over the source file names: a walk of the source
                   option when --recursive is given, otherwise the glob
                   expansion of each positional argument
    destination -- destination option with "~" expanded
    backing     -- backing option with "~" expanded; created if missing

    Prints a message and exits with status 1 if no backing root was given,
    or if the backing root exists but is not a directory.
    """
    from glob import glob

    options, args = parser.parse_args()
    if not options.backing:
        print("Require backing_root argument")
        exit(1)

    backing = os.path.expanduser(options.backing)
    if not os.path.exists(backing):
        print("Create dir %s" % backing)
        os.makedirs(backing)
    elif not os.path.isdir(backing):
        # Explicit check rather than `assert`: assertions are stripped under
        # -O, and this is a user-input error, not a programming error.
        print("Backing root %s is not a directory" % backing)
        exit(1)

    destination = os.path.expanduser(options.destination)
    if options.recursive:
        # NOTE(review): the source is deliberately not passed through
        # expanduser() (the original had that line commented out),
        # presumably because destination names are derived from the source
        # paths -- confirm before changing.
        fileiter = walker(options.source)
    else:
        fileiter = (filename for arg in args for filename in glob(arg))
    return fileiter, destination, backing
if __name__ == '__main__':
    # get_options() returns (fileiter, destination, backing), which is the
    # positional order sha_file_system() expects, so unpack it straight in.
    sha_file_system(*get_options(get_parser()))
    exit(0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I have the same problem with photos. I have a one-liner I use to find exact duplicates: