Created
January 12, 2024 17:30
-
-
Save av1d/411eaeae4dc8c62a6d507b714f6b5029 to your computer and use it in GitHub Desktop.
Scan directories recursively and create a checksum of each file. Compare the checksums and copy differing files to another folder while maintaining the directory hierarchy.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import hashlib | |
import shutil | |
def get_file_md5(filename): | |
"""Calculate the MD5 hash for a file.""" | |
md5_hash = hashlib.md5() | |
with open(filename, "rb") as f: | |
for chunk in iter(lambda: f.read(4096), b""): | |
md5_hash.update(chunk) | |
return md5_hash.hexdigest() | |
def scan_directory(directory, output_directory):
    """Recursively scan a directory and copy the flagged files to another
    directory while retaining their hierarchy.

    Every file under ``directory`` is hashed with ``get_file_md5``. The first
    file seen with a given hash is remembered. Each later file whose hash has
    already been seen is recorded together with that first file, and is then
    copied into ``output_directory`` at the same path it has relative to
    ``directory``.

    NOTE(review): the names say "mismatched", but the files recorded and
    copied are the ones whose hash *matches* an earlier file. Confirm the
    intended meaning with the author before renaming anything.

    Args:
        directory: Root of the tree to scan.
        output_directory: Root under which flagged files are copied.

    Returns:
        A ``(mismatched_files, file_count)`` tuple. ``mismatched_files`` is a
        flat list of alternating entries ``[later_file, first_file, ...]``,
        so it holds two entries per flagged file. ``file_count`` is the
        number of files that were hashed.
    """
    md5_dict = {}
    mismatched_files = []
    file_count = 0
    output_root = os.path.abspath(output_directory)

    for root, dirs, files in os.walk(directory):
        # If the output tree lives inside the scanned tree, skip it so copies
        # made by an earlier run are not hashed and reported again.
        dirs[:] = [
            name for name in dirs
            if os.path.abspath(os.path.join(root, name)) != output_root
        ]
        for file in files:
            filepath = os.path.join(root, file)
            md5 = get_file_md5(filepath)
            file_count += 1
            if md5 in md5_dict:
                # Keep the flat [later_file, first_file] layout callers use.
                mismatched_files.extend([filepath, md5_dict[md5]])
            else:
                md5_dict[md5] = filepath

    # Even positions hold the later-seen files; only those are copied.
    for source_file in mismatched_files[::2]:
        target_file = os.path.join(
            output_directory, os.path.relpath(source_file, directory)
        )
        os.makedirs(os.path.dirname(target_file), exist_ok=True)
        shutil.copy2(source_file, target_file)

    return mismatched_files, file_count
# Usage example
# Guarded so that importing this module for its functions does not start a
# scan of the hard-coded paths below; running the file as a script still does.
if __name__ == "__main__":
    directory = "/home/av1d/files"
    output_directory = "/home/av1d/unique_files"

    mismatched_files, file_count = scan_directory(directory, output_directory)

    print(f"There are {file_count} files in the directory.")
    # scan_directory returns two list entries per flagged file, hence // 2.
    print(f"There are {len(mismatched_files) // 2} mismatched files.")
    print(f"Mismatched files have been copied to {output_directory}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment