Class for detecting duplicate files based on their hash values.
#!/usr/bin/env python3
import csv
from collections import Counter
from datetime import datetime
from pathlib import Path

# Helper functions live in the companion gist list_files_sha256
# (a hedged sketch of that module follows the script).
from list_files_sha256 import hash_by_256, list_fpaths_in_dir


class FindDupFile:
    """
    Class for detecting duplicate files based on their hash values.

    This class scans a specified directory, computes the SHA-256 hash for each file, and
    provides methods to identify duplicate files, extract file names with specific hash
    values, and generate a report of duplicates.

    Attributes:
        name_hash_dict (dict): A dictionary where keys are file names (str), and values
            are their corresponding SHA-256 hash values (str), generated from the target
            directory.
    """

    def __init__(self, directory):
        """
        Initializes an instance with a dictionary that maps file names to their SHA-256 hash values.

        This constructor scans the target directory, identifies all file paths within it,
        and computes the SHA-256 hash for each file, storing the mappings in the instance
        attribute `name_hash_dict`.

        Args:
            directory (str): The path to the directory whose files will be scanned and hashed.
        """
        fpaths = list_fpaths_in_dir(directory)
        name_hash_dict = {}
        for each_path in fpaths:
            name_hash_dict[each_path.name] = hash_by_256(each_path)
        self.name_hash_dict = name_hash_dict

    def find_duplicate_hash(self):
        """
        Identifies duplicate hash values and their occurrence counts.

        This method tallies the hash values in the `name_hash_dict` attribute with a
        `Counter`, collects the hash values that appear more than once, and returns
        a dictionary mapping these duplicate hash values to their counts.

        Returns:
            dict: A dictionary where keys are duplicate hash values (str) and values
                are their counts (int).
        """
        tallied_hashes = Counter(self.name_hash_dict.values())
        duplicate_hash_dict = {}
        for hash_digest, hash_count in tallied_hashes.items():
            if hash_count > 1:
                duplicate_hash_dict[hash_digest] = hash_count
        return duplicate_hash_dict

    def extract_files_w_hash(self, hash_digest):
        """
        Extracts the list of file names with a matching hash value from the internal dictionary.

        This method iterates through the dictionary of file names and their corresponding
        hash values, and collects the file names whose hash values match the provided
        `hash_digest`.

        Args:
            hash_digest (str): The hash value to be matched against the stored hash values.

        Returns:
            list: A list of file names (str) whose hash values match the given
                `hash_digest`.
        """
        duplicated_files = []
        for name_, hash_ in self.name_hash_dict.items():
            if hash_ == hash_digest:
                duplicated_files.append(name_)
        return duplicated_files

    def create_report(self):
        """
        Builds a table of duplicated files, one row per (hash, file name) pair.

        Returns:
            list: A list of rows, where the first row is the header
                ['sha256', 'file_name'].
        """
        duplicate_hash_dict = self.find_duplicate_hash()
        hash_name_tbl = [['sha256', 'file_name']]
        for duplicate_hash in duplicate_hash_dict:
            duplicated_file_names = self.extract_files_w_hash(duplicate_hash)
            for duplicated_file_name in duplicated_file_names:
                hash_name_tbl.append([duplicate_hash, duplicated_file_name])
        return hash_name_tbl


def save_as_csv(array_in, save_path):
    """Writes a 2-D list to `save_path` as a CSV file."""
    # newline='' prevents csv.writer from emitting blank rows on Windows.
    with open(save_path, 'w', newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerows(array_in)
    print(f'\033[93mSaved: {save_path}\033[0m')


if __name__ == '__main__':
    # Initialize an instance.
    target_dir = Path(input('\033[93mDIR? >> \033[0m').strip())
    if not target_dir.is_dir():
        raise ValueError(f'\033[93mInvalid directory: {target_dir}\033[0m')
    instance = FindDupFile(str(target_dir))

    # Create a table of duplicated files.
    duplicates_tbl = instance.create_report()

    # Save the table as a CSV file next to the target directory.
    dttm = datetime.now().strftime('%Y%m%d_%H%M')
    fname = f'DUPLICATES_{target_dir.name}_{dttm}.csv'
    fpath = target_dir.parent / fname
    save_as_csv(duplicates_tbl, str(fpath))
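
The script depends on hash_by_256 and list_fpaths_in_dir from the companion gist list_files_sha256, which is not reproduced here. Below is a minimal sketch of what that module could look like, assuming hash_by_256 returns a file's SHA-256 hex digest as a string and list_fpaths_in_dir returns the regular files directly inside a directory (non-recursive); the actual companion module may differ.

#!/usr/bin/env python3
# list_files_sha256.py -- hypothetical sketch, not the original companion module.
import hashlib
from pathlib import Path


def hash_by_256(fpath, chunk_size=65536):
    """Computes the SHA-256 hex digest of a file, reading it in chunks."""
    sha256 = hashlib.sha256()
    with open(fpath, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            sha256.update(chunk)
    return sha256.hexdigest()


def list_fpaths_in_dir(directory):
    """Returns the Paths of regular files directly inside `directory`."""
    return [p for p in Path(directory).iterdir() if p.is_file()]

One design note: FindDupFile keys name_hash_dict by bare file name, so a non-recursive listing like the one sketched above avoids the name collisions that a recursive walk could introduce.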