Created
November 18, 2010 09:35
-
-
Save williame/704809 to your computer and use it in GitHub Desktop.
zip to folder delta calc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Set to False if you want an actual zip file of the delta to be written to
# disk (e.g. you want to recompress it with 7z) instead of an in-memory buffer.
to_mem = True

import zipfile
import sys
import os
import cStringIO

# The script takes exactly two arguments: the previous zip and the new folder.
if len(sys.argv) != 3:
    print("Diffs a zip file against a folder, and computes a file-granularity delta")
    print("usage: python diff1.py prev.zip newfolder")
    print("e.g. if MRise_1.0.zip is in your current directory, and you have the 1.6 directory tree in a folder called MRise_1.6, then:")
    print(" python diff1.py MRise_1.0.zip MRise_1.6")
    sys.exit(1)

prev_filename = sys.argv[1]  # path of the previous release's zip archive
src_folder = sys.argv[2]     # root of the new release's directory tree

# the zip file used to calculate the compressed size of the new content;
# it is backed by nbuf when to_mem is set, otherwise by delta_filename on disk
nbuf = cStringIO.StringIO()
delta_filename = "diff1.tmp.zip"
new = zipfile.ZipFile(nbuf if to_mem else delta_filename, "w", zipfile.ZIP_DEFLATED)
class FInfo:
    """a file on the filesystem (in the new folder)

    Attributes:
        filename: name given by the caller; joined onto ``src_folder`` when
            the contents are loaded.
        file_size: size given by the caller.
        new: starts as True; other code clears it once the file has been
            matched against an entry of the previous archive.
        renamed: list that ``Rename`` appends previous-archive entries to.
        prev: starts as None; set elsewhere to the matching previous entry.
        dup: starts as None; set elsewhere to the FInfo this one duplicates.
    """

    def __init__(self, filename, file_size):
        self.filename = filename
        self.file_size = file_size
        self.new = True
        self._body = None  # cache for body(); None means "not loaded yet"
        self.renamed = []
        self.prev = None
        self.dup = None

    def body(self):
        """lazy loading of the bytes in the file from disk

        The contents are read on first use and cached for later calls.
        """
        if self._body is None:
            path = os.path.join(src_folder, self.filename)
            # Binary mode: the result is compared with raw bytes read from a
            # zip archive, so newline translation must not touch it.  The
            # with-block also makes sure the handle is closed again.
            with open(path, "rb") as handle:
                self._body = handle.read()
        return self._body

    def __repr__(self):
        return "%s %d" % (self.filename, self.file_size)
class Rename:
    """Pairs a file of the new folder with the previous entry it was renamed from.

    ``f`` is the file in the new folder and ``p`` the entry of the previous
    archive; creating the pairing also updates the bookkeeping on ``f``.
    """

    def __init__(self, f, p):
        # the new-folder file is no longer "new": it already existed as p
        f.renamed.append(p)
        f.new = False
        f.prev = p
        self.p = p
        self.f = f

    def __repr__(self):
        old_name = self.p.filename
        new_name = self.f.filename
        return "(%s -> %s)" % (old_name, new_name)
prev = zipfile.ZipFile(prev_filename, "r")
# a list of all the files in the previous archive; entries whose name ends
# in a path separator are directories and are filtered out
pinfo = [p for p in prev.infolist() if not p.filename.endswith(("/", "\\"))]

finfo = {}  # all the files in the new folder, indexed by file-name
sinfo = {}  # all the files in the new folder, indexed by file size (useful to find renames)

# walk the file system and fill up finfo and sinfo
for (path, dirs, files) in os.walk(src_folder):
    for name in files:
        fullpath = os.path.join(path, name)
        # relpath() stays correct when src_folder was given with a trailing
        # separator (slicing by len(src_folder)+1 would eat the first
        # character of the name).  Names are normalised to "/" so that they
        # are comparable with the entry names of the zip archive.
        relpath = os.path.relpath(fullpath, src_folder).replace(os.sep, "/")
        size = os.stat(fullpath).st_size
        entry = finfo[relpath] = FInfo(relpath, size)
        sinfo.setdefault(size, []).append(entry)
# calc the diff of pinfo vs finfo
deleted = []            # previous entries with no counterpart in the new folder
renamed = []            # Rename objects: same bytes, different name
changed = []            # previous entries whose contents differ in the new folder
changed_same_size = []  # subset of changed where the size stayed the same
unchanged = []          # previous entries that are byte-identical in the new folder
for p in pinfo:
    if p.filename not in finfo:
        # The name is gone.  It counts as a rename when some file of the new
        # folder with the same size has identical bytes, else as a deletion.
        match = None
        candidates = sinfo.get(p.file_size)
        if candidates:
            # decompress the previous entry once, not once per candidate
            pbody = prev.read(p.filename)
            for s in candidates:
                if s.body() == pbody:
                    match = s
                    break
        if match is None:
            deleted.append(p)
        else:
            renamed.append(Rename(match, p))
        continue
    # The name exists on both sides.
    f = finfo[p.filename]
    f.new = False
    if p.file_size != f.file_size:
        changed.append(p)
    elif prev.read(p.filename) != f.body():
        # same size but different bytes; record the previous entry so that
        # `changed` holds one kind of object throughout
        changed.append(p)
        changed_same_size.append(p)
        print("changed same size: %s" % p.filename)
    else:
        unchanged.append(p)
        # only unchanged files remember their previous entry, so that their
        # compressed size can be reused instead of recompressing them
        f.prev = p
# scan for dups in new file system: files of equal size are compared
# pairwise, and every later file with the same bytes is pointed at the
# first one via its `dup` attribute
dups = []
for same_size in sinfo.values():
    # not so many so don't try and be too clever
    for idx, first in enumerate(same_size):
        if first.dup is not None:
            # already known to be a copy of an earlier file
            continue
        for other in same_size[idx + 1:]:
            if first.body() != other.body():
                continue
            print("new %s dups %s" % (first.filename, other))
            other.dup = first
            dups.append(other)
# for trivia, scan for dups in previous zip
# group the previous entries by uncompressed size first
psinfo = {}
for p in pinfo:
    psinfo.setdefault(p.file_size, []).append(p)
pdups = []
for group in psinfo.values():
    # not so many so don't try and be too clever
    if len(group) < 2:
        continue
    bodies = [prev.read(entry.filename) for entry in group]
    for i, head in enumerate(bodies):
        if head is None:
            # this entry was already reported as a dup of an earlier one
            continue
        for j in range(i + 1, len(bodies)):
            if bodies[j] is None or bodies[j] != head:
                continue
            print("previous %s dups %s %s %s" % (
                group[i].filename,
                group[j].filename,
                group[j].compress_size,
                group[j].file_size,
            ))
            # blank the body out so the entry is not reported a second time
            bodies[j] = None
            pdups.append(group[j])
# for those that are new or changed, work out the new compressed size
copied = 0   # compressed bytes that can be taken over from the previous archive
added = []   # files of the new folder still flagged as new
for info in finfo.values():
    if info.new:
        added.append(info)
    if info.dup is not None:
        # content is identical to another file of the new folder
        print("skipping dup %s" % (info,))
        continue
    if info.prev is not None:
        # a previous entry is recorded for it: reuse its compressed size
        copied += info.prev.compress_size
        continue
    print("compressing %s" % (info,))
    new.write(os.path.join(src_folder, info.filename))
new.close()
# size of the delta archive, from the memory buffer or from the file on disk
if to_mem:
    delta = len(nbuf.getvalue())
else:
    delta = os.stat(delta_filename).st_size
def sz(num):
    """pretty print a file size

    ``num`` is a size in bytes.  It is divided by 1024 until it is below
    1024 and returned as a string with one decimal and the matching unit,
    e.g. ``sz(1536)`` gives ``"1.5 KB"``.  Sizes of 1024 TB and above are
    still expressed in TB.
    """
    units = ['bytes', 'KB', 'MB', 'GB', 'TB']
    for unit in units[:-1]:
        if num < 1024.0:
            return "%3.1f %s" % (num, unit)
        num /= 1024.0
    # the largest unit takes whatever is left, so that a string is always
    # returned instead of falling off the end of the loop with None
    return "%3.1f %s" % (num, units[-1])
# dump the stats: one line per category with the number of files and their
# total size; "->" introduces the corresponding compressed size
print("===")

added_bytes = sum(item.file_size for item in added)
print("%s added: %s" % (len(added), sz(added_bytes)))

renamed_bytes = sum(item.f.file_size for item in renamed)
print("%s renamed: %s" % (len(renamed), sz(renamed_bytes)))

unchanged_bytes = sum(item.file_size for item in unchanged)
unchanged_packed = sum(item.compress_size for item in unchanged)
print("%s unchanged: %s -> %s" % (len(unchanged), sz(unchanged_bytes), sz(unchanged_packed)))

changed_bytes = sum(item.file_size for item in changed)
same_size_bytes = sum(item.file_size for item in changed_same_size)
same_size_note = "(%d,%s)" % (len(changed_same_size), sz(same_size_bytes))
print("%s changed: %s %s" % (len(changed), sz(changed_bytes), same_size_note))

deleted_bytes = sum(item.file_size for item in deleted)
deleted_packed = sum(item.compress_size for item in deleted)
print("%s deleted: %s -> %s" % (len(deleted), sz(deleted_bytes), sz(deleted_packed)))

# the previous archive is measured by the size of the zip file itself
previous_bytes = sum(item.file_size for item in pinfo)
previous_packed = os.stat(prev_filename).st_size
print("%s previously: %s -> %s" % (len(pinfo), sz(previous_bytes), sz(previous_packed)))

# the new release is what could be copied plus what had to be compressed
now_bytes = sum(item.file_size for item in finfo.values())
print("%s now: %s -> %s" % (len(finfo), sz(now_bytes), sz(copied + delta)))

pdups_bytes = sum(item.file_size for item in pdups)
pdups_packed = sum(item.compress_size for item in pdups)
print("%s dups in previous archive: %s -> %s" % (len(pdups), sz(pdups_bytes), sz(pdups_packed)))

dups_bytes = sum(item.file_size for item in dups)
print("%s dups avoided in new archive: %s" % (len(dups), sz(dups_bytes)))

print("delta: %s" % sz(delta))
if to_mem:
    print("(all zipping done to memory)")
else:
    print("(delta in %s)" % delta_filename)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
added a dup scanner