Encoding-aware duplicate text file finder (and optional deleter), for POSIX/NT, written in Python 3
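The script relies on three third-party packages, visible in its imports: chardet, charset-normalizer and tqdm. Assuming a standard Python 3 environment, they could be installed with something like:

    pip install chardet charset-normalizer tqdm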
#!/usr/bin/env python3
# coding=utf-8

import hashlib
import os
import unicodedata

from argparse import ArgumentParser, Namespace

from chardet import detect as chardet_detector
from charset_normalizer import detect as charset_detector
from tqdm import tqdm

# (chardet, charset_normalizer) result pairs for which the Charset Normalizer guess is preferred
CHARSET_PREFER: list[tuple[str, str]] = [
    ('cp949', 'windows-1250'),
    ('euc-jp', 'big5'),
    ('macroman', 'cp932'),
    ('macroman', 'windows-1256')
]


def _delete_path(input_path: str) -> None:
    if _is_valid_path(input_path=input_path, allow_broken_links=True):
        os.remove(input_path)

        print(f' {input_path} [Deleted]')
    else:
        print(f' {input_path} [Error]')


def _is_valid_path(input_path: str, allow_broken_links: bool = False) -> bool:
    input_path_abs: str = os.path.abspath(input_path)

    if os.path.lexists(input_path_abs):
        if not os.path.isdir(input_path_abs):
            if allow_broken_links:
                return os.path.isfile(input_path_abs) or os.path.islink(input_path_abs)

            return os.path.isfile(input_path_abs)

    return False


def _get_path_files(check_paths: list) -> list[str]:
    path_files: list[str] = []

    for check_path in check_paths:
        check_path_abs: str = os.path.abspath(check_path)

        if os.path.isdir(check_path_abs):
            # noinspection PyArgumentEqualDefault
            for root_path, _, file_names in os.walk(check_path_abs, followlinks=False):
                for file_name in file_names:
                    path_files.append(os.path.join(root_path, file_name))
        elif _is_valid_path(input_path=check_path_abs, allow_broken_links=True):
            path_files.append(check_path_abs)

    return path_files


def _detect_encoding(in_path: str) -> str:
    """ Determine encoding via Chardet or Charset Normalizer """

    with open(in_path, 'rb') as in_file:
        in_buffer: bytes = in_file.read()

    chardet_result: str | None = chardet_detector(in_buffer)['encoding']
    charset_result: str | None = charset_detector(in_buffer)['encoding']

    if chardet_result != charset_result and chardet_result and charset_result:
        chardet_result_norm: str = chardet_result.lower().replace('_', '-')
        charset_result_norm: str = charset_result.lower().replace('_', '-')

        # When the detectors disagree, prefer BOM-signed and UTF-8 guesses,
        # then any pairing listed in CHARSET_PREFER, otherwise the Chardet guess
        if '-sig' in chardet_result_norm:
            encoding_result: str = chardet_result
        elif '-sig' in charset_result_norm:
            encoding_result = charset_result
        elif 'utf-8' in chardet_result_norm:
            encoding_result = chardet_result
        elif 'utf-8' in charset_result_norm:
            encoding_result = charset_result
        elif (chardet_result_norm, charset_result_norm) in CHARSET_PREFER:
            encoding_result = charset_result
        else:
            encoding_result = chardet_result
    else:
        encoding_result = chardet_result or charset_result or 'utf-8'

    return encoding_result


def _normalize_text_for_hashing(in_text: str) -> str:
    """ Normalize text for consistent hashing across different files """

    # Unicode normalization
    text: str = unicodedata.normalize('NFC', in_text)

    # Line ending normalization
    text = '\n'.join(text.splitlines())

    # Whitespace normalization
    text = text.strip()

    return text


def check_duplicates(in_paths: list, is_delete: bool) -> None:
    file_text_hashes: dict[str, set] = {}

    # Group files by the SHA-1 hash of their normalized, decoded text
    for file_path in tqdm(_get_path_files(in_paths)):
        with open(file_path, 'r', encoding=_detect_encoding(file_path), errors='ignore') as file_pointer:
            file_text: str = file_pointer.read()

        file_text_norm: str = _normalize_text_for_hashing(file_text)

        file_text_hash: str = hashlib.sha1(file_text_norm.encode('utf-8')).hexdigest()

        if file_text_hash not in file_text_hashes:
            file_text_hashes[file_text_hash] = set()

        file_text_hashes[file_text_hash].add(file_path)

    for file_text_hash, file_paths in file_text_hashes.items():
        file_paths_count: int = len(file_paths)

        if file_paths_count > 1:
            print(f'\nFound {file_paths_count} files with text hash {file_text_hash}')

            # The shortest (then lexicographically first) path is kept as the original
            for file_index, file_path in enumerate(sorted(file_paths, key=lambda fp: (len(fp), fp))):
                if file_index == 0:
                    print(f' {file_path} [Original]')
                else:
                    if is_delete:
                        _delete_path(input_path=file_path)
                    else:
                        print(f' {file_path} [Duplicate]')


if __name__ == "__main__":
    parser: ArgumentParser = ArgumentParser()

    parser.add_argument('paths', nargs='*')
    parser.add_argument('-d', '--delete', help='delete duplicate files', action='store_true')

    arguments: Namespace = parser.parse_args()

    if arguments.paths:
        check_duplicates(in_paths=arguments.paths, is_delete=arguments.delete)
    else:
        parser.print_help()
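Typical usage, based on the argparse definitions above (the filename dupe_check.py is a hypothetical choice, not given in the gist): pass one or more files or directories, and add -d/--delete to remove every duplicate except the retained original.

    python3 dupe_check.py ~/Documents ~/Backups
    python3 dupe_check.py ~/Documents ~/Backups --delete

Without any paths, the script simply prints its help text.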