Created
November 6, 2020 23:04
-
-
Save abaines/d9f3ff96a11f432440542d66d0144ab6 to your computer and use it in GitHub Desktop.
Search for duplicate files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hashlib
import os
import sys
from collections import defaultdict
# Show the current working directory at start-up.
print('cwd', os.getcwd())
def hashFile(f):
    """Return the SHA-224 hex digest of the contents of the file at ``f``.

    The file is opened in binary mode and fed to the hash in fixed-size
    chunks, so a large file is never held in memory all at once, and the
    ``with`` block closes the handle as soon as hashing finishes.

    Args:
        f: Path of the file to hash (anything ``open`` accepts).

    Returns:
        The digest as a hexadecimal string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha224()
    with open(f, 'rb') as stream:
        # iter() with a sentinel keeps calling read() until it returns b''.
        for chunk in iter(lambda: stream.read(1024 * 1024), b''):
            digest.update(chunk)
    return digest.hexdigest()
def searchFolder(folder):
    """Print groups of byte-identical ``.jar`` files found directly in ``folder``.

    Only regular files in ``folder`` itself are examined; sub-directories are
    not descended into. The absolute path of every regular file is printed as
    it is visited. Files whose path ends in ``.jar`` are hashed with
    ``hashFile`` and grouped by digest. For each digest shared by more than
    one file, the digest is printed, followed by the matching paths and a
    blank line.

    Args:
        folder: Directory to scan.

    Raises:
        OSError: If ``folder`` cannot be listed or a file cannot be read.
    """
    pathsByDigest = defaultdict(list)
    for entry in os.listdir(folder):
        fullFileName = os.path.abspath(os.path.join(folder, entry))
        if not os.path.isfile(fullFileName):
            continue
        print(fullFileName)
        # Match on the suffix: a substring test would also accept any path
        # that merely contains ".jar" somewhere in a directory or file name.
        if not fullFileName.endswith(".jar"):
            continue
        pathsByDigest[hashFile(fullFileName)].append(fullFileName)

    for digest, paths in pathsByDigest.items():
        if len(paths) > 1:
            # Print this group's own digest, not the last one computed above.
            print(digest)
            for duplicate in paths:
                print(duplicate)
            print()
def findDiffs(path):
    """Report ``.jar`` file names under ``path`` whose copies differ in content.

    Walks ``path`` recursively, hashes every file whose full name ends in
    ``.jar`` with ``hashFile``, and groups the full paths first by bare file
    name and then by digest. For every file name that maps to a number of
    distinct digests other than one, prints the name with that number, then
    each digest with how many paths share it, then those paths.

    Args:
        path: Root directory of the tree to walk.
    """
    # file name -> {digest -> [full paths whose content has that digest]}
    versionsByName = {}
    for root, _dirs, filenames in os.walk(path):
        for filename in filenames:
            fullName = os.path.join(root, filename)
            if not fullName.endswith(".jar"):
                continue
            digest = hashFile(fullName)
            sameName = versionsByName.setdefault(filename, {})
            sameName.setdefault(digest, []).append(fullName)

    for name, byDigest in versionsByName.items():
        distinct = len(byDigest)
        if distinct == 1:
            # Every copy of this name has identical content; nothing to report.
            continue
        print(name + " " + str(distinct))
        for digest, locations in byDigest.items():
            print(" " + digest + " " + str(len(locations)))
            for location in locations:
                print(" " + location)
def findDups(path):
    """Find files with identical content under ``path`` and rename one per group.

    Walks ``path`` recursively, hashes every file with ``hashFile``, and
    groups the full paths by digest. For each digest shared by more than one
    file, prints the group size with the list of paths, prints the first
    path, and renames that first file by appending ``__`` to its name. The
    remaining files in the group are left untouched.

    Args:
        path: Root directory of the tree to walk.
    """
    pathsByDigest = {}
    for root, _dirs, filenames in os.walk(path):
        for filename in filenames:
            fullName = os.path.join(root, filename)
            pathsByDigest.setdefault(hashFile(fullName), []).append(fullName)

    for group in pathsByDigest.values():
        if len(group) < 2:
            continue
        print(len(group), group)
        first = group[0]
        print(first)
        # Only the first path of the group is renamed.
        os.rename(first, first + "__")
# Run the duplicate scan over the hard-coded image folder, then report completion.
findDups("E:\\Mosaic Images")
print("Complete")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment