Skip to content

Instantly share code, notes, and snippets.

@silicontrip
Created January 6, 2018 02:12
Show Gist options
  • Select an option

  • Save silicontrip/f07ef78c2516943b1b1b0b424e080176 to your computer and use it in GitHub Desktop.

Select an option

Save silicontrip/f07ef78c2516943b1b1b0b424e080176 to your computer and use it in GitHub Desktop.
Copy utility that, before copying each file, checks whether it already exists anywhere within the destination directory tree.
#!/usr/bin/python
import sys
import os
import json
import shutil
import hashlib
import atexit
from threading import Thread
def writerepodb():
    """Write the in-memory index to the index file as JSON.

    Reads the module-level names ``repodb`` (the index dictionary) and
    ``repofile`` (the path of the index file); both are assigned by the
    script body before this function is called or registered.
    """
    with open(repofile, "w") as handle:
        serialized = json.dumps(repodb)
        handle.write(serialized)
def md5(fname, chunk_size=65536):
    """Return the hexadecimal MD5 digest of the file at ``fname``.

    The file is read in binary mode, ``chunk_size`` bytes at a time, so
    files larger than memory can be hashed.

    Args:
        fname: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration. Must be a
            positive integer; it affects speed only, never the digest.

    Returns:
        The digest as a string of 32 lowercase hexadecimal characters.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # read(0) returns b"" immediately, which would end the loop at once and
    # silently yield the digest of empty input for every file.
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    digest = hashlib.md5()
    with open(fname, "rb") as f:
        # iter() with a sentinel keeps calling read() until it returns b"".
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()
def findsize(size, repodb):
    """Return the index keys whose recorded file size equals ``size``.

    Args:
        size: File size to look for, compared against each entry's
            ``'st_size'`` value.
        repodb: Index dictionary mapping a key to a dictionary of
            attributes for that entry.

    Returns:
        A list of matching keys, in the dictionary's iteration order.
        The list is empty when nothing matches.
    """
    # Entries without a recorded size cannot match; skipping them avoids a
    # KeyError aborting the whole run on an incomplete index entry.
    return [
        key
        for key, info in repodb.items()
        if 'st_size' in info and info['st_size'] == size
    ]
def printusage():
    """Print the one-line command synopsis to standard output."""
    usage = "repo_cp.py <SRC DIR> <DST DIR>"
    print(usage)
# Command-line validation: exactly two arguments are expected, the source
# directory and the destination directory. Any failure prints an error and
# the usage line, then exits with a non-zero status so callers can detect it.
if len(sys.argv) != 3:
    print("ERROR: incorrect number of arguments")
    printusage()
    sys.exit(1)
src = sys.argv[1]
dst = sys.argv[2]
# Both paths must be directories; failing either check is an error.
if not (os.path.isdir(src) and os.path.isdir(dst)):
    print("ERROR: SRC or DST is not a directory")
    printusage()
    sys.exit(1)
# Load the destination index from DST/.repo_db, or build it by walking DST
# when no index file exists yet. Each entry is keyed by the file's path
# relative to DST and initially records only the file size; MD5 digests are
# added to entries later, when a comparison needs them.
repofile = os.path.join(dst, ".repo_db")
repodb = {}
if os.path.exists(repofile):
    with open(repofile) as rf:
        repodb = json.load(rf)
else:
    print("indexing...")
    for root, dirs, files in os.walk(dst):
        for fn in files:
            pn = os.path.join(root, fn)
            # relpath removes only the leading DST component, even when DST
            # was given with a trailing separator or its text recurs deeper
            # in the path.
            pk = os.path.relpath(pn, dst)
            repodb[pk] = {"st_size": os.stat(pn).st_size}
    # Persist the fresh index right away. The original ran this in a Thread
    # that was joined immediately, so it was already synchronous.
    writerepodb()
# Save the index again at interpreter exit so entries and digests added
# during the copy pass are kept.
atexit.register(writerepodb)
# Copy pass: classify every file under SRC against the destination index
# and copy only files whose content is not already present under DST.
#
# For each source file one progress line is printed:
#   exists               - same relative path, same size and same MD5
#   changed              - same relative path, different size or MD5
#   found different path - another indexed file has the same size and MD5
#   new                  - no match; the file is copied and indexed

# Walk SRC once so the progress total and the copy pass see the same files.
source_files = []
for root, dirs, files in os.walk(src):
    for fn in files:
        source_files.append(os.path.join(root, fn))
total = len(source_files)

for count, pn in enumerate(source_files, 1):
    progress = str(count) + '/' + str(total)
    st = os.stat(pn)
    # Index key: path relative to SRC. relpath removes only the leading SRC
    # component, even when SRC was given with a trailing separator.
    pk = os.path.relpath(pn, src)

    if pk in repodb:
        entry = repodb[pk]
        # A size difference already proves the file changed, so the
        # (expensive) hashing is only done when the sizes agree.
        same = st.st_size == entry.get('st_size')
        if same:
            if 'md5' not in entry:
                # Cache the destination digest in the index for later runs.
                entry['md5'] = md5(os.path.join(dst, pk))
            same = md5(pn) == entry['md5']
        if same:
            print(progress + ' exists: ' + pk)
        else:
            # Changed files are only reported; nothing is overwritten.
            print(progress + ' changed: ' + pk)
        continue

    # Not indexed under this path: look for identical content elsewhere in
    # the destination, using size as a cheap pre-filter before hashing.
    srcmd5 = None
    duplicate = None
    candidates = findsize(st.st_size, repodb)
    if candidates:
        srcmd5 = md5(pn)
        for dn in candidates:
            if 'md5' not in repodb[dn]:
                repodb[dn]['md5'] = md5(os.path.join(dst, dn))
            if srcmd5 == repodb[dn]['md5']:
                duplicate = dn
                break

    if duplicate is not None:
        print(progress + " found different path: " + pk + " <=> " + duplicate)
        continue

    print(progress + " new: " + pk)
    target = os.path.join(dst, pk)
    target_dir = os.path.dirname(target)
    # copy2 does not create missing parent directories, so a file in a
    # subdirectory that DST lacks would otherwise abort the run.
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    shutil.copy2(pn, target)
    # Record the digest only when it was actually computed above.
    record = {'st_size': st.st_size}
    if srcmd5 is not None:
        record['md5'] = srcmd5
    repodb[pk] = record
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment