Created
April 15, 2025 07:15
-
-
Save yeiichi/e80bdd8c618ba17f6c99e22ddaa4cfb1 to your computer and use it in GitHub Desktop.
Class for detecting the character encoding of files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import subprocess | |
from argparse import ArgumentParser | |
from charset_normalizer import from_path | |
class ChardetJa:
    """Detect the character encoding of a file.

    Detection is attempted with charset-normalizer first and falls back to
    the external ``nkf`` command. The first method that yields an encoding
    wins. Each attempt reports its outcome with ``print`` using ANSI color
    codes; if every method fails, a failure message is printed.

    Detection runs immediately when an instance is created.

    Attributes:
        file_path (str): The path of the file for which to detect encoding.
        encoding (str): The detected encoding of the file, or None if
            detection fails.
    """

    # Constants for terminal colors
    COLOR_WARNING = '\033[93m'
    COLOR_ERROR = '\033[31m'
    COLOR_RESET = '\033[0m'

    # Error/Default Messages
    ENCODING_FAILURE_MSG = "Failed to detect encoding."
    CHARSET_NORMALIZER_NAME = "charset-normalizer"
    NKF_NAME = "nkf"

    # Lower-cased nkf answers that do not name a usable text encoding.
    # "binary" is what `nkf -g` prints for non-text input; returning it as an
    # encoding would mislead callers.
    NKF_NON_ENCODINGS = frozenset({'', 'unknown', 'binary'})

    def __init__(self, file_path):
        """Store ``file_path`` and run detection right away.

        Args:
            file_path (str): Path of the file to inspect.
        """
        self.file_path = file_path
        self.encoding = self._detect_encoding()

    def _print_success(self, method_name, encoding):
        """Print a colored success line naming the method and its result."""
        print(f"{self.COLOR_WARNING}[✓] {method_name}: {encoding}{self.COLOR_RESET}")

    def _print_error(self, method_name, error_msg):
        """Print a colored error line naming the method and the problem."""
        print(f"{self.COLOR_ERROR}[!] {method_name} error: {error_msg}{self.COLOR_RESET}")

    def _detect_with_charset_normalizer(self):
        """Try to detect the encoding using charset-normalizer.

        Returns:
            The encoding name of the best match, or None when there is no
            match or the library raises.
        """
        try:
            best_guess = from_path(self.file_path).best()
        except Exception as e:
            # Best effort by design: any failure here must fall through to
            # the next detection method instead of aborting.
            self._print_error(self.CHARSET_NORMALIZER_NAME, str(e))
            return None

        if best_guess and best_guess.encoding:
            self._print_success(self.CHARSET_NORMALIZER_NAME, best_guess.encoding)
            return best_guess.encoding
        return None

    @classmethod
    def _parse_nkf_output(cls, output):
        """Turn raw ``nkf -g`` output into an encoding name.

        Args:
            output (str): Text printed by nkf.

        Returns:
            The stripped encoding name, or None when nkf reported nothing
            usable (empty output, "unknown" or "BINARY", case-insensitive).
        """
        name = output.strip()
        if name.lower() in cls.NKF_NON_ENCODINGS:
            return None
        return name

    def _detect_with_nkf(self):
        """Try to detect the encoding using the external nkf command.

        Returns:
            The encoding name reported by ``nkf -g``, or None when nkf is not
            on PATH, fails to run, or does not report a text encoding.
        """
        import shutil

        # Check if nkf is installed
        if shutil.which(self.NKF_NAME) is None:
            self._print_error(self.NKF_NAME, f"{self.NKF_NAME} is not installed or not found in PATH.")
            return None

        # Attempt to run nkf and detect encoding
        try:
            output = subprocess.check_output(
                [self.NKF_NAME, '-g', self.file_path], encoding='utf-8'
            )
        except Exception as e:
            # _print_error already adds the "<name> error:" prefix, so only
            # the exception text is passed along.
            self._print_error(self.NKF_NAME, str(e))
            return None

        nkf_encoding = self._parse_nkf_output(output)
        if nkf_encoding:
            self._print_success(self.NKF_NAME, nkf_encoding)
        return nkf_encoding

    def _detect_encoding(self):
        """Run each detection method in order and return the first result.

        Returns:
            The first truthy encoding name produced by a method, or None
            (after printing the failure message) when all methods fail.
        """
        methods = (self._detect_with_charset_normalizer, self._detect_with_nkf)
        for method in methods:
            encoding = method()
            if encoding:
                return encoding
        print(f"{self.COLOR_ERROR}[×] {self.ENCODING_FAILURE_MSG}{self.COLOR_RESET}")
        return None
def main(file_path):
    """Detect and return the encoding of the file at ``file_path``.

    Builds a ``ChardetJa`` for the path and hands back its ``encoding``
    attribute, which is None when detection failed.
    """
    return ChardetJa(file_path).encoding
if __name__ == '__main__':
    # Command-line entry point: takes one positional argument, the path of
    # the file to inspect, and passes it to main().
    cli = ArgumentParser(description="Detect the encoding of a file.")
    cli.add_argument("file_path", type=str, help="Path to the file to detect.")
    main(cli.parse_args().file_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment