Skip to content

Instantly share code, notes, and snippets.

@av1d
Created January 12, 2024 17:30
Show Gist options
  • Save av1d/411eaeae4dc8c62a6d507b714f6b5029 to your computer and use it in GitHub Desktop.
Save av1d/411eaeae4dc8c62a6d507b714f6b5029 to your computer and use it in GitHub Desktop.
Scan directories recursively, create checksum of each file. Compare checksums, copy differing files to another folder while maintaining directory hierarchy.
import os
import hashlib
import shutil
def get_file_md5(filename, chunk_size=65536):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is opened in binary mode and read in fixed-size chunks, so the
    whole file never has to be held in memory at once.

    MD5 is used here only as a content fingerprint; it is not suitable for
    security-sensitive purposes.

    Args:
        filename: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration. Must be positive.

    Returns:
        The MD5 digest of the file contents as a hexadecimal string.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    # read(0) would return b"" immediately (hashing nothing) and a negative
    # size would read the entire file in one go, so reject both up front.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    md5_hash = hashlib.md5()
    with open(filename, "rb") as f:
        # iter(callable, sentinel) keeps calling read() until it returns b"",
        # which is what a binary file yields at end-of-file.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            md5_hash.update(chunk)
    return md5_hash.hexdigest()
def scan_directory(directory, output_directory):
    """Scan a tree, record files whose checksum repeats, and copy them out.

    Walks ``directory`` recursively and hashes every file with
    ``get_file_md5``. The first file seen with a given checksum is remembered.
    Every later file with the same checksum is recorded, together with the
    remembered file, and is then copied into ``output_directory`` at the same
    path it has relative to ``directory``.

    NOTE(review): the recorded files are called "mismatched" throughout, but
    the selection rule implemented here is "checksum was already seen", i.e.
    files whose content repeats an earlier file. Confirm this is the intended
    meaning before relying on the name.

    Args:
        directory: Root of the tree to scan.
        output_directory: Root under which the recorded files are copied.
            Intermediate directories are created as needed.

    Returns:
        A tuple ``(mismatched_files, file_count)``:

        * ``mismatched_files`` is a flat list with two entries per match, in
          the order ``[later_file, first_seen_file, later_file, ...]``. The
          number of matches is therefore ``len(mismatched_files) // 2``.
        * ``file_count`` is the number of files that were hashed.

    Raises:
        OSError: If a file cannot be read or a copy cannot be written.
    """
    md5_dict = {}
    mismatched_files = []
    file_count = 0
    output_abs = os.path.abspath(output_directory)

    for root, dirs, files in os.walk(directory):
        # Assigning to dirs[:] steers the (top-down) walk itself:
        #  * sorting makes the visiting order - and therefore which file of a
        #    matching pair counts as "first seen" - the same on every run;
        #  * dropping output_directory keeps copies left by a previous run
        #    from being hashed and re-copied into a nested output tree.
        dirs[:] = sorted(
            name for name in dirs
            if os.path.abspath(os.path.join(root, name)) != output_abs
        )
        for file in sorted(files):
            filepath = os.path.join(root, file)
            md5 = get_file_md5(filepath)
            if md5 in md5_dict:
                # Keep the flat [later, first_seen] pair layout that callers
                # rely on when they compute len(...) // 2.
                mismatched_files.extend([filepath, md5_dict[md5]])
            else:
                md5_dict[md5] = filepath
            file_count += 1

    # Even positions hold the later-seen file of each pair; only those are
    # copied, mirroring their location relative to the scanned root.
    for source_file in mismatched_files[::2]:
        target_file = os.path.join(
            output_directory, os.path.relpath(source_file, directory)
        )
        os.makedirs(os.path.dirname(target_file), exist_ok=True)
        shutil.copy2(source_file, target_file)

    return mismatched_files, file_count
# Usage example.
# The two paths stay at module level so they remain visible to importers.
directory = "/home/av1d/files"
output_directory = "/home/av1d/unique_files"

# Only scan and copy when executed as a script; importing this module for its
# functions should not walk the filesystem or write files.
if __name__ == "__main__":
    mismatched_files, file_count = scan_directory(directory, output_directory)
    print(f"There are {file_count} files in the directory.")
    # mismatched_files holds two entries per match, hence the halving.
    print(f"There are {len(mismatched_files) // 2} mismatched files.")
    print(f"Mismatched files have been copied to {output_directory}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment