Remove duplicate files with hash check
import os
import re
import hashlib

# Set the working directory to scan.
DIR = "/Volumes/Desktop"

def calculate_hash(filepath):
    """Return the SHA-256 hex digest of a file, read in 64 KB chunks."""
    hasher = hashlib.sha256()
    with open(filepath, 'rb') as f:
        while True:
            data = f.read(65536)  # read in 64 KB chunks
            if not data:
                break
            hasher.update(data)
    return hasher.hexdigest()

# Find copies named like "file (1).jpg" or "file (2).png", and delete a copy
# only after a hash check confirms it is identical to the original.
def remove_duplicates(root_dir):
    pattern = re.compile(r"(.*) \(\d+\)(\..*)?$")
    for dirpath, dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            match = pattern.match(filename)
            if match:
                # Derive the presumed original name by dropping the counter.
                original_file = match.group(1) + (match.group(2) or "")
                original_filepath = os.path.join(dirpath, original_file)
                duplicate_filepath = os.path.join(dirpath, filename)
                # Only delete when the presumed original exists and the
                # contents are byte-identical.
                if os.path.exists(original_filepath):
                    original_hash = calculate_hash(original_filepath)
                    duplicate_hash = calculate_hash(duplicate_filepath)
                    if original_hash == duplicate_hash:
                        os.remove(duplicate_filepath)
                        print(f"Removed {duplicate_filepath}")

remove_duplicates(DIR)
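
A quick sanity check of the filename pattern, as a minimal sketch with hypothetical filenames: the regex only treats a name as a duplicate when a space precedes the parenthesized counter, and it derives the presumed original by dropping that counter.

import re

pattern = re.compile(r"(.*) \(\d+\)(\..*)?$")

# Hypothetical filenames to illustrate what the script would match.
for name in ["photo (1).jpg", "report (2).pdf", "notes (3)", "photo.jpg", "file(1).jpg"]:
    match = pattern.match(name)
    if match:
        original = match.group(1) + (match.group(2) or "")
        print(f"{name!r} -> presumed original {original!r}")
    else:
        print(f"{name!r} -> left alone")

Running this shows that "photo (1).jpg" resolves to "photo.jpg" while "file(1).jpg" (no space) is skipped; in the script above, a matched file is still only deleted after the byte-level hash comparison against the original succeeds.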