Created
February 6, 2022 16:57
-
-
Save lihas/e7b102000afad3856c2c7efa68e0499d to your computer and use it in GitHub Desktop.
This Python script compares the files in two directories by their MD5 hashes. It lists the files that have the same MD5 and the ones that don't. I use this to verify my file backups. The directory structures are different, hence a direct folder comparison with tools like Beyond Compare (compare on contents) won't work.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is a Python script.
# It compares the files in two directories by their MD5 hashes and
# lists the files that have the same MD5 and the ones that don't.
# I use this to verify my file backups.
# The directory structures are different, hence a direct folder
# comparison with tools like Beyond Compare (compare on contents) won't work.
import glob
import hashlib
import os
from datetime import datetime
# Roots of the two directory trees whose file contents are compared by MD5.
# Edit these paths before running the script.
dir1Path = "D:/"
dir2Path = "E:/LighroomLibrary"
# Press the green button in the gutter to run the script.
def ProcessPath(dirPath):
    """Group every regular file under ``dirPath`` by the MD5 of its contents.

    The tree is listed recursively with ``glob.glob("**", recursive=True)``
    while ``dirPath`` is the working directory, so the paths in the result
    are relative to ``dirPath``. Entries that are not regular files
    (directories, for example) are ignored.

    Args:
        dirPath: Directory to scan.

    Returns:
        A dict mapping an MD5 hex digest to the list of relative paths whose
        contents produce that digest. A list with more than one entry means
        the files are byte-for-byte duplicates of each other.

    Raises:
        OSError: If ``dirPath`` cannot be entered or a file cannot be read.
    """
    # Hash in fixed-size pieces so a large file is never held in memory whole.
    chunkSize = 1024 * 1024
    fileHashMap = {}
    cwd = os.getcwd()
    os.chdir(dirPath)
    try:
        for file in glob.glob("**", recursive=True):
            if not os.path.isfile(file):
                continue
            md5 = hashlib.md5()
            # The context manager closes the handle even if a read fails.
            with open(file, "rb") as fhandle:
                for chunk in iter(lambda: fhandle.read(chunkSize), b""):
                    md5.update(chunk)
            fileHashMap.setdefault(md5.hexdigest(), []).append(file)
    finally:
        # Always give the caller back its original working directory.
        os.chdir(cwd)
    return fileHashMap
# Hash both trees, printing the elapsed time after each one finishes.
startTime = datetime.now()
print("startTime", startTime)
dir1hashMap = ProcessPath(dir1Path)
print("path1 processed", datetime.now() - startTime)
dir2hashMap = ProcessPath(dir2Path)
print("path2 processed", datetime.now() - startTime)

# Digests present in both trees: paths from dir1 followed by paths from dir2.
sameFiles = {
    digest: paths + dir2hashMap[digest]
    for digest, paths in dir1hashMap.items()
    if digest in dir2hashMap
}
# Digests that occur in only one of the two trees.
dir1ExtraFiles = {
    digest: paths
    for digest, paths in dir1hashMap.items()
    if digest not in dir2hashMap
}
dir2ExtraFiles = {
    digest: paths
    for digest, paths in dir2hashMap.items()
    if digest not in dir1hashMap
}

# Report each group under its heading, one "digest [paths]" line per entry.
for title, group in (
    ("same files", sameFiles),
    ("Dir1 extra files", dir1ExtraFiles),
    ("Dir2 extra files", dir2ExtraFiles),
):
    print(title)
    for digest, paths in group.items():
        print(digest, paths)

print("runTime", datetime.now() - startTime)
# See PyCharm help at https://www.jetbrains.com/help/pycharm/
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment