Class for detecting duplicate files based on their hash values.
#!/usr/bin/env python3
import csv
from collections import Counter
from datetime import datetime
from pathlib import Path

# Helper functions live in the companion gist list_files_sha256
# (a hedged sketch of that module follows the script).
from list_files_sha256 import hash_by_256, list_fpaths_in_dir


class FindDupFile:
    """
    Class for detecting duplicate files based on their hash values.

    This class scans a specified directory, computes the SHA-256 hash for each file, and
    provides methods to identify duplicate files, extract file names with specific hash
    values, and generate a report of duplicates.

    Attributes:
        name_hash_dict (dict): A dictionary where keys are file names (str), and values
            are their corresponding SHA-256 hash values (str), generated from the target
            directory.
    """

    def __init__(self, directory):
        """
        Initializes an instance with a dictionary that maps file names to their SHA-256 hash values.

        This constructor scans the target directory, identifies all file paths within it,
        and computes the SHA-256 hash for each file, storing the mappings in the instance
        attribute `name_hash_dict`.

        Args:
            directory (str): The path to the directory whose files will be scanned and hashed.
        """
        fpaths = list_fpaths_in_dir(directory)
        name_hash_dict = {}
        for each_path in fpaths:
            name_hash_dict[each_path.name] = hash_by_256(each_path)
        self.name_hash_dict = name_hash_dict

    def find_duplicate_hash(self):
        """
        Identifies duplicate hash values and their occurrence counts.

        This method tallies the hash values in the `name_hash_dict` attribute with a
        `Counter`, collects the hash values that appear more than once, and returns
        a dictionary mapping these duplicate hash values to their counts.

        Returns:
            dict: A dictionary where keys are duplicate hash values (str) and values
                are their counts (int).
        """
        tallied_hashes = Counter(self.name_hash_dict.values())
        duplicate_hash_dict = {}
        for hash_digest, hash_count in tallied_hashes.items():
            if hash_count > 1:
                duplicate_hash_dict[hash_digest] = hash_count
        return duplicate_hash_dict

    def extract_files_w_hash(self, hash_digest):
        """
        Extracts the list of file names with a matching hash value from the internal dictionary.

        This method iterates through the dictionary of file names and their corresponding
        hash values, and collects the file names whose hash values match the provided
        `hash_digest`.

        Args:
            hash_digest (str): The hash value to be matched against the stored hash values.

        Returns:
            list: A list of file names (str) whose hash values match the given
                `hash_digest`.
        """
        duplicated_files = []
        for name_, hash_ in self.name_hash_dict.items():
            if hash_ == hash_digest:
                duplicated_files.append(name_)
        return duplicated_files

    def create_report(self):
        """
        Builds a table of duplicated files, one row per (hash, file name) pair.

        Returns:
            list: A list of rows, where the first row is the header
                ['sha256', 'file_name'].
        """
        duplicate_hash_dict = self.find_duplicate_hash()
        hash_name_tbl = [['sha256', 'file_name']]
        for duplicate_hash in duplicate_hash_dict:
            duplicated_file_names = self.extract_files_w_hash(duplicate_hash)
            for duplicated_file_name in duplicated_file_names:
                hash_name_tbl.append([duplicate_hash, duplicated_file_name])
        return hash_name_tbl


def save_as_csv(array_in, save_path):
    """Writes a 2-D list to `save_path` as a CSV file."""
    # newline='' prevents csv.writer from emitting blank rows on Windows.
    with open(save_path, 'w', newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerows(array_in)
    print(f'\033[93mSaved: {save_path}\033[0m')


if __name__ == '__main__':
    # Initialize an instance.
    target_dir = Path(input('\033[93mDIR? >> \033[0m').strip())
    if not target_dir.is_dir():
        raise ValueError(f'\033[93mInvalid directory: {target_dir}\033[0m')
    instance = FindDupFile(str(target_dir))

    # Create a table of duplicated files.
    duplicates_tbl = instance.create_report()

    # Save the table as a CSV file next to the target directory.
    dttm = datetime.now().strftime('%Y%m%d_%H%M')
    fname = f'DUPLICATES_{target_dir.name}_{dttm}.csv'
    fpath = target_dir.parent / fname
    save_as_csv(duplicates_tbl, str(fpath))
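
The script depends on hash_by_256 and list_fpaths_in_dir from the companion gist list_files_sha256, which is not reproduced here. Below is a minimal sketch of what that module could look like, assuming hash_by_256 returns a file's SHA-256 hex digest as a string and list_fpaths_in_dir returns the regular files directly inside a directory (non-recursive); the actual companion module may differ.

#!/usr/bin/env python3
# list_files_sha256.py -- hypothetical sketch, not the original companion module.
import hashlib
from pathlib import Path


def hash_by_256(fpath, chunk_size=65536):
    """Computes the SHA-256 hex digest of a file, reading it in chunks."""
    sha256 = hashlib.sha256()
    with open(fpath, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            sha256.update(chunk)
    return sha256.hexdigest()


def list_fpaths_in_dir(directory):
    """Returns the Paths of regular files directly inside `directory`."""
    return [p for p in Path(directory).iterdir() if p.is_file()]

One design note: FindDupFile keys name_hash_dict by bare file name, so a non-recursive listing like the one sketched above avoids the name collisions that a recursive walk could introduce.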