Class for detecting duplicate files based on their hash values.
#!/usr/bin/env python3
import csv
from collections import Counter
from datetime import datetime
from pathlib import Path

# Local companion module (see the assumed sketch after the script).
from list_files_sha256 import hash_by_256, list_fpaths_in_dir

class FindDupFile:
    """
    Class for detecting duplicate files based on their hash values.

    This class scans a specified directory, computes the SHA-256 hash for each file, and
    provides methods to identify duplicate files, extract file names with specific hash
    values, and generate a report of duplicates.

    Attributes:
        name_hash_dict (dict): A dictionary where keys are file names (str), and values
            are their corresponding SHA-256 hash values (str), generated from the target
            directory.
    """
    def __init__(self, directory):
        """
        Initializes an instance with a dictionary that maps file names to their SHA-256 hash values.

        This constructor scans the target directory, identifies all file paths within it, and
        computes the SHA-256 hash for each file. It stores these mappings in the instance
        attribute `name_hash_dict`.

        Args:
            directory (str): The path to the directory whose files will be scanned and hashed.
        """
        fpaths = list_fpaths_in_dir(directory)
        name_hash_dict = {}
        for each_path in fpaths:
            # Keys are bare file names, so a later file with the same name
            # silently overwrites an earlier entry.
            name_hash_dict[each_path.name] = hash_by_256(each_path)
        self.name_hash_dict = name_hash_dict
    def find_duplicate_hash(self):
        """
        Identifies duplicate hash values and their occurrence counts.

        This method tallies the hash values in the `name_hash_dict` attribute using a
        `Counter` and identifies which hash values appear more than once. It then
        returns a dictionary mapping these duplicate hash values to their counts.

        Returns:
            dict: A dictionary where keys are duplicate hash values (str) and values are their counts (int).
        """
        tallied_hashes = Counter(self.name_hash_dict.values())
        duplicate_hash_dict = {}
        for hash_digest, hash_count in tallied_hashes.items():
            if hash_count > 1:
                duplicate_hash_dict[hash_digest] = hash_count
        return duplicate_hash_dict
    def extract_files_w_hash(self, hash_digest):
        """
        Extracts the list of file names with a matching hash value from the internal dictionary.

        This method iterates through the dictionary of file names and their corresponding
        hash values, identifies the file names whose hash values match the provided
        `hash_digest`, and returns them in a list.

        Args:
            hash_digest (str): The hash value to be matched against the stored hash values.

        Returns:
            list: A list of file names (str) whose hash values match the given
                `hash_digest`.
        """
        duplicated_files = []
        for name_, hash_ in self.name_hash_dict.items():
            if hash_ == hash_digest:
                duplicated_files.append(name_)
        return duplicated_files
    def create_report(self):
        """
        Builds a table of duplicated files.

        Returns:
            list: A header row `['sha256', 'file_name']` followed by one
                `[hash, file_name]` row per duplicated file.
        """
        duplicate_hash_dict = self.find_duplicate_hash()
        hash_name_tbl = [['sha256', 'file_name']]
        for duplicate_hash in duplicate_hash_dict:
            duplicated_file_names = self.extract_files_w_hash(duplicate_hash)
            for duplicated_file_name in duplicated_file_names:
                hash_name_tbl.append([duplicate_hash, duplicated_file_name])
        return hash_name_tbl
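
# Illustrative shape of the table returned by FindDupFile.create_report()
# (the hash and file names below are made-up examples, not real output):
# [['sha256', 'file_name'],
#  ['e3b0c442...', 'notes.txt'],
#  ['e3b0c442...', 'notes_copy.txt']]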
def save_as_csv(array_in, save_path):
    """Writes a list of rows to a CSV file and reports the save path."""
    # newline='' prevents blank lines between rows on Windows.
    with open(save_path, 'w', newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerows(array_in)
    print(f'\033[93mSaved: {save_path}\033[0m')
if __name__ == '__main__':
    # Initialize an instance.
    target_dir = Path(input('\033[93mDIR? >> \033[0m').strip())
    if not target_dir.is_dir():
        raise ValueError(f'\033[93mInvalid directory: {target_dir}\033[0m')
    instance = FindDupFile(str(target_dir))
    # Create a table of duplicated files.
    duplicates_tbl = instance.create_report()
    # Save the table as a CSV file next to the target directory.
    dttm = datetime.now().strftime('%Y%m%d_%H%M')
    fname = f'DUPLICATES_{target_dir.name}_{dttm}.csv'
    fpath = target_dir.parent / fname
    save_as_csv(duplicates_tbl, str(fpath))
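
The script imports hash_by_256 and list_fpaths_in_dir from a companion module, list_files_sha256, which is not included here. Below is a minimal sketch of what that module might look like, inferred purely from how the two helpers are used above; the chunked-read loop and the non-recursive directory listing are assumptions, not the gist author's code.

#!/usr/bin/env python3
# list_files_sha256.py -- assumed companion module (not part of this gist).
# Inferred contract: hash_by_256(path) returns a SHA-256 hex digest (str),
# and list_fpaths_in_dir(directory) returns the file Paths in a directory.
import hashlib
from pathlib import Path


def hash_by_256(fpath, chunk_size=65536):
    """Returns the SHA-256 hex digest of a file, read in chunks."""
    sha256 = hashlib.sha256()
    with open(fpath, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            sha256.update(chunk)
    return sha256.hexdigest()


def list_fpaths_in_dir(directory):
    """Returns a list of file Paths (non-recursive) in the directory."""
    return [p for p in Path(directory).iterdir() if p.is_file()]

Reading each file in fixed-size chunks keeps memory use flat even for large files, which matters when hashing an entire directory.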