Skip to content

Instantly share code, notes, and snippets.

@yeiichi
Created April 15, 2025 07:15
Show Gist options
  • Save yeiichi/e80bdd8c618ba17f6c99e22ddaa4cfb1 to your computer and use it in GitHub Desktop.
Save yeiichi/e80bdd8c618ba17f6c99e22ddaa4cfb1 to your computer and use it in GitHub Desktop.
Class for detecting the character encoding of files.
#!/usr/bin/env python3
import subprocess
from argparse import ArgumentParser
from charset_normalizer import from_path
class ChardetJa:
    """Detect the character encoding of a file.

    Detection is attempted with charset-normalizer first and, if that yields
    nothing, with the external ``nkf`` command. The first method that produces
    a result wins. When both fail, a failure message is printed and the
    encoding is left as None. Every step reports its outcome on stdout using
    ANSI color codes.

    Attributes:
        file_path (str): The path of the file for which to detect encoding.
        encoding (str): The detected encoding of the file, or None if
            detection fails.
    """

    # Constants for terminal colors
    COLOR_WARNING = '\033[93m'
    COLOR_ERROR = '\033[31m'
    COLOR_RESET = '\033[0m'

    # Error/Default Messages
    ENCODING_FAILURE_MSG = "Failed to detect encoding."
    CHARSET_NORMALIZER_NAME = "charset-normalizer"
    NKF_NAME = "nkf"

    # Options placed between the nkf executable and the file path
    # ("-g" asks nkf to print its guess and exit).
    NKF_ARGS = ("-g",)
    # Lower-cased nkf answers that do not name a usable text encoding.
    # "binary" is included because nkf's guess mode answers BINARY for
    # non-text input, which must not be reported as a detected encoding.
    NKF_NON_ENCODINGS = frozenset({"", "unknown", "binary"})

    def __init__(self, file_path):
        """Store ``file_path`` and run detection immediately.

        Args:
            file_path: Path of the file to inspect.
        """
        self.file_path = file_path
        self.encoding = self._detect_encoding()

    def _print_success(self, method_name, encoding):
        """Print which method succeeded and the encoding it found."""
        print(f"{self.COLOR_WARNING}[✓] {method_name}: {encoding}{self.COLOR_RESET}")

    def _print_error(self, method_name, error_msg):
        """Print an error line attributed to ``method_name``."""
        print(f"{self.COLOR_ERROR}[!] {method_name} error: {error_msg}{self.COLOR_RESET}")

    def _detect_with_charset_normalizer(self):
        """Try to detect the encoding using charset-normalizer.

        Returns:
            The encoding name of the best match, or None when there is no
            match or the library raised.
        """
        try:
            best_guess = from_path(self.file_path).best()
        except Exception as e:
            # Deliberately broad: this is a best-effort step in a fallback
            # chain, so any failure is reported and the next method is tried.
            self._print_error(self.CHARSET_NORMALIZER_NAME, str(e))
            return None

        if best_guess and best_guess.encoding:
            self._print_success(self.CHARSET_NORMALIZER_NAME, best_guess.encoding)
            return best_guess.encoding
        return None

    def _detect_with_nkf(self):
        """Try to detect the encoding using the external nkf command.

        Returns:
            The encoding name printed by nkf, or None when nkf is not on
            PATH, the command fails, or its answer is not a text encoding
            (empty, "unknown" or "BINARY", compared case-insensitively).
        """
        # Imported here because the module's top-level imports do not
        # include shutil; this keeps the method self-contained.
        import shutil

        if shutil.which(self.NKF_NAME) is None:
            self._print_error(self.NKF_NAME, f"{self.NKF_NAME} is not installed or not found in PATH.")
            return None

        try:
            guess = subprocess.check_output(
                [self.NKF_NAME, *self.NKF_ARGS, self.file_path],
                encoding='utf-8',
            ).strip()
        except Exception as e:
            # _print_error already prefixes "<name> error:", so pass the bare
            # message instead of adding a second "Error:" label.
            self._print_error(self.NKF_NAME, str(e))
            return None

        if guess.lower() in self.NKF_NON_ENCODINGS:
            return None
        self._print_success(self.NKF_NAME, guess)
        return guess

    def _detect_encoding(self):
        """Run each detection method in order and return the first result.

        Returns:
            The first truthy encoding produced by a method, or None after
            printing the failure message when every method came up empty.
        """
        for method in (self._detect_with_charset_normalizer, self._detect_with_nkf):
            encoding = method()
            if encoding:
                return encoding

        print(f"{self.COLOR_ERROR}[×] {self.ENCODING_FAILURE_MSG}{self.COLOR_RESET}")
        return None
def main(file_path):
    """Detect the encoding of the file at ``file_path`` and return it.

    Builds a ChardetJa for the path and hands back its ``encoding``
    attribute, which is None when detection failed.
    """
    return ChardetJa(file_path).encoding
if __name__ == '__main__':
    # Command-line entry point: one positional argument, the file to inspect.
    cli = ArgumentParser(description="Detect the encoding of a file.")
    cli.add_argument("file_path", type=str, help="Path to the file to detect.")
    main(cli.parse_args().file_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment