Created
March 22, 2022 07:15
-
-
Save lihas/3df58ca4a3379ebf0b7eda331014e494 to your computer and use it in GitHub Desktop.
Parallel version of https://gist.github.com/lihas/e7b102000afad3856c2c7efa68e0499d
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is a Python script.
# Same as the previous version, except that hashing is done by thread workers.
# For one particular run the previous script took 21 minutes, while this one took 14 minutes.
# This Python script compares the files in two directories
# by their MD5 hash. It lists the files that have the same MD5
# and the ones that don't. I use this to verify my file backups.
# The directory structures are different, hence a direct folder
# comparison with tools like Beyond Compare (compare on contents) won't work.
import glob | |
import os | |
import hashlib | |
from datetime import datetime | |
import concurrent.futures | |
# Root of the first directory tree to scan; passed to ProcessPath below and
# reported under the "Dir1 extra files" heading.
dir1Path = "D:/DCIM/100CANON"
# Root of the second directory tree to scan; passed to ProcessPath below and
# reported under the "Dir2 extra files" heading.
dir2Path = "E:/LighroomLibrary"
# Press the green button in the gutter to run the script. | |
def GetFileHash(filePath):
    """Compute the MD5 digest of a single file.

    Args:
        filePath: Path of the directory entry to hash.

    Returns:
        A dict that always has the keys "hash", "path" and "isFile":

        * regular file that was read successfully:
          {"hash": <hex digest>, "path": filePath, "isFile": True}
        * anything that is not a regular file (directory, missing path):
          {"hash": None, "path": None, "isFile": False}
        * regular file that could not be read:
          {"hash": None, "path": filePath, "isFile": False, "error": <message>}

        "isFile" is therefore True only when "hash" holds a usable digest, so
        callers can gate on it without checking for None.
    """
    if not os.path.isfile(filePath):
        return {"hash": None, "path": None, "isFile": False}

    md5 = hashlib.md5()
    try:
        # Hash in fixed-size chunks so large files never have to fit in
        # memory; the context manager closes the handle even on a read error.
        with open(filePath, 'rb') as fhandle:
            for chunk in iter(lambda: fhandle.read(1024 * 1024), b""):
                md5.update(chunk)
    except OSError as err:
        # Best effort: an unreadable file is reported instead of silently
        # producing no result at all.
        return {"hash": None, "path": filePath, "isFile": False,
                "error": str(err)}

    return {"hash": md5.hexdigest(), "path": filePath, "isFile": True}
def ProcessPath(dirPath, dirFutures):
    """Queue an MD5 hashing job for every entry found under dirPath.

    Args:
        dirPath: Root directory to scan recursively.
        dirFutures: List that receives one Future per entry found. It is
            mutated in place; each future resolves to the value returned by
            GetFileHash for that entry.

    Raises:
        OSError: If dirPath cannot be entered (raised by os.chdir).

    The function returns as soon as all jobs are queued; hashing continues
    on the pool's worker threads.
    """
    cwd = os.getcwd()
    os.chdir(dirPath)
    try:
        # Globbing relative to dirPath keeps the results relative, so the
        # joined paths below look exactly like dirPath + '/' + <relative>.
        fileList = glob.glob("**", recursive=True)
    finally:
        # Always restore the working directory, even if globbing fails.
        # NOTE(review): chdir is process-wide; with a relative dirPath, jobs
        # still running from an earlier call could resolve against the wrong
        # directory during this window -- confirm callers pass absolute paths.
        os.chdir(cwd)

    # One bounded pool for the whole directory instead of a fresh executor
    # (and thread) per file.
    executor = concurrent.futures.ThreadPoolExecutor()
    try:
        for file in fileList:
            file_path = dirPath + '/' + file
            dirFutures.append(executor.submit(GetFileHash, file_path))
    finally:
        # wait=False: do not block here. Already-queued jobs still run, and
        # the pool's resources are released once they have all finished.
        executor.shutdown(wait=False)
def ProcessFutures(futures):
    """Collect hashing results and group file paths by their MD5 digest.

    Args:
        futures: Iterable of Futures whose results are either None or dicts
            with the keys "hash", "path" and "isFile".

    Returns:
        A dict mapping each hex digest to the list of paths that produced it.
        Paths appear in completion order, which is not deterministic.

    Raises:
        Whatever exception a worker raised, re-raised by Future.result().
    """
    fileHashMap = {}
    for fut in concurrent.futures.as_completed(futures):
        res = fut.result()
        # A worker may produce no record at all, or a record without a
        # digest; neither describes a hashed file, so neither is grouped.
        if not res or not res["isFile"] or res["hash"] is None:
            continue
        fileHashMap.setdefault(res["hash"], []).append(res["path"])
    return fileHashMap
# Driver: queue hashing jobs for both trees, gather the digests, then report
# which digests occur in both trees and which occur in only one of them.
startTime = datetime.now()
print("startTime", startTime)

# Queue the jobs for both directories before collecting any result.
dir1Futures = []
ProcessPath(dir1Path, dir1Futures)
print("path1 processed", datetime.now())

dir2Futures = []
ProcessPath(dir2Path, dir2Futures)
print("path2 processed", datetime.now())

# Wait for the workers and build the digest -> [paths] tables.
print("processing dir 1 futures", datetime.now())
dir1hashMap = ProcessFutures(dir1Futures)
print("processing dir 2 futures", datetime.now())
dir2hashMap = ProcessFutures(dir2Futures)
print("done processing futures", datetime.now())

# Digests present in both trees: dir1 paths first, then dir2 paths.
sameFiles = {
    digest: dir1hashMap[digest] + dir2hashMap[digest]
    for digest in dir1hashMap
    if digest in dir2hashMap
}
# Digests that appear in only one of the two trees.
dir1ExtraFiles = {
    digest: paths
    for digest, paths in dir1hashMap.items()
    if digest not in dir2hashMap
}
dir2ExtraFiles = {
    digest: paths
    for digest, paths in dir2hashMap.items()
    if digest not in dir1hashMap
}

# Each section prints its heading followed by one "digest [paths]" line per entry.
for heading, table in (
    ("same files", sameFiles),
    ("Dir1 extra files", dir1ExtraFiles),
    ("Dir2 extra files", dir2ExtraFiles),
):
    print(heading)
    for digest, paths in table.items():
        print(digest, paths)

print("runTime", datetime.now() - startTime)
# See PyCharm help at https://www.jetbrains.com/help/pycharm/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment