Remove duplicate files (e.g. "file (1).jpg", "file (2).png") by SHA-256 hash check.
import os
import re
import hashlib

# Working directory to scan
DIR = "/Volumes/Desktop"

def calculate_hash(filepath):
    hasher = hashlib.sha256()
    with open(filepath, 'rb') as f:
        while True:
            data = f.read(65536)  # read in 64 KB chunks
            if not data:
                break
            hasher.update(data)
    return hasher.hexdigest()

# Find copies named like "file (1).jpg" / "file (2).png" and, after verifying
# by hash that they are identical to the original, remove them.
def remove_duplicates(root_dir):
    pattern = re.compile(r"(.*) \(\d+\)(\..*)?$")
    for dirpath, dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            match = pattern.match(filename)
            if match:
                original_file = match.group(1) + (match.group(2) or "")
                original_filepath = os.path.join(dirpath, original_file)
                duplicate_filepath = os.path.join(dirpath, filename)
                if os.path.exists(original_filepath):
                    original_hash = calculate_hash(original_filepath)
                    duplicate_hash = calculate_hash(duplicate_filepath)
                    if original_hash == duplicate_hash:
                        os.remove(duplicate_filepath)
                        print(f"Removed {duplicate_filepath}")

remove_duplicates(DIR)
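
To see which names the pattern treats as duplicates before deleting anything, a dry-run variant can reuse the same regex and calculate_hash and only report matches. The preview_duplicates function below is a minimal sketch added here for illustration, not part of the original gist.

# Dry-run sketch (hypothetical, not in the original gist): report which files
# the same pattern + hash check would remove, without deleting them.
def preview_duplicates(root_dir):
    pattern = re.compile(r"(.*) \(\d+\)(\..*)?$")
    for dirpath, _, filenames in os.walk(root_dir):
        for filename in filenames:
            match = pattern.match(filename)
            if not match:
                continue
            original = match.group(1) + (match.group(2) or "")
            original_path = os.path.join(dirpath, original)
            duplicate_path = os.path.join(dirpath, filename)
            # Only flag the copy if the original exists and the contents match.
            if os.path.exists(original_path) and \
                    calculate_hash(original_path) == calculate_hash(duplicate_path):
                print(f"Would remove {duplicate_path} (duplicate of {original_path})")

# Example: "photo (1).jpg" is flagged only when an identical "photo.jpg" exists.
# preview_duplicates(DIR)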