Created
October 4, 2021 11:07
-
-
Save InputBlackBoxOutput/5ecc992ee22e70987e1ff36f5ac5ec68 to your computer and use it in GitHub Desktop.
Remove duplicate files in a directory, identified by comparing their SHA-256 hashes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import hashlib | |
import glob | |
# Get the SHA-256 hash of a file
def sha256(fname, size=4096):
    """Return the hex-encoded SHA-256 digest of the file at *fname*.

    The file is opened in binary mode and read in chunks of *size* bytes,
    so large files are never loaded into memory all at once.

    Args:
        fname: Path of the file to hash.
        size: Number of bytes to read per chunk; must be at least 1.

    Returns:
        The digest as a hexadecimal string.

    Raises:
        ValueError: If *size* is smaller than 1.
        OSError: If the file cannot be opened or read.
    """
    # A chunk size of zero makes the very first read return b"", which would
    # silently produce the digest of empty input for every file.
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")

    sha256_hash = hashlib.sha256()
    with open(fname, "rb") as f:
        # iter() with a sentinel keeps reading until read() returns b"" (EOF).
        for byte_block in iter(lambda: f.read(size), b""):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest()
# Find files with identical SHA-256 hashes and remove the duplicates
def remove_duplicate(path):
    """Delete files matching *path* whose content duplicates an earlier match.

    Every regular file matched by the glob pattern is hashed with SHA-256.
    The first file seen with a given digest is kept; each later file with
    the same digest is reported on stdout and removed from disk.

    Args:
        path: Glob pattern selecting the candidate files,
            e.g. ``"images/*.jpg"``.

    Raises:
        OSError: If a matched file cannot be read or removed.
    """
    # A set gives O(1) membership tests; a list would rescan every digest
    # collected so far for each file.
    seen_hashes = set()

    # glob returns matches in arbitrary order; sorting makes the surviving
    # copy predictable (the first path in sorted order).
    for file in sorted(glob.glob(path)):
        # The pattern may also match directories, which cannot be hashed.
        if not os.path.isfile(file):
            continue

        filehash = sha256(file)
        if filehash in seen_hashes:
            print(f"Removing file: {file}")
            os.remove(file)
        else:
            seen_hashes.add(filehash)
# Script entry point: de-duplicate the JPEG files in the local images/ folder.
if __name__ == "__main__":
    remove_duplicate("images/*.jpg")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment