Skip to content

Instantly share code, notes, and snippets.

@dmikushin
Created January 28, 2025 23:06
Show Gist options
  • Save dmikushin/cf39184e480e6f00851ad8527b2bee24 to your computer and use it in GitHub Desktop.
Save dmikushin/cf39184e480e6f00851ad8527b2bee24 to your computer and use it in GitHub Desktop.
Convert files from JIS (Japanese Industrial Standards, e.g. Shift_JIS) encoding to UTF-8 encoding
#!/usr/bin/env python3
import chardet
import argparse
import mimetypes
import sys
def is_text_file(file_path):
    """Return True when the MIME type guessed for *file_path* is textual.

    The guess comes from ``mimetypes.guess_type``; when no type can be
    guessed the path is treated as non-text.
    """
    guessed_type = mimetypes.guess_type(file_path)[0]
    if guessed_type is None:
        return False
    return guessed_type.startswith('text')
def process_file(file_path):
    """Detect a file's encoding and convert it to UTF-8 when it is a JIS variant.

    The file is read as raw bytes and handed to ``chardet.detect``. The
    detected encoding and its confidence are printed. When the encoding name
    contains "JIS" (compared case-insensitively), the bytes are passed to
    ``convert_jis_to_utf8``, which rewrites the file in place.

    Args:
        file_path: Path of the file to inspect.
    """
    with open(file_path, 'rb') as file:
        raw_data = file.read()

    result = chardet.detect(raw_data)
    encoding = result['encoding']
    confidence = result['confidence']

    if not encoding:
        # No encoding was reported (presumably e.g. empty input); say so
        # explicitly instead of finishing without any output.
        print(f"{file_path}: could not detect encoding")
        return

    # Compare case-insensitively so the check does not depend on the exact
    # casing the detector uses for names such as "SHIFT_JIS" / "Shift_JIS".
    if 'JIS' in encoding.upper():
        print(f"{file_path} has JIS encoding, {confidence*100:.2f}% confidence")
        convert_jis_to_utf8(file_path, raw_data, encoding)
    else:
        print(f"{file_path} has {encoding} encoding, {confidence*100:.2f}% confidence")
def convert_jis_to_utf8(file_path, raw_data, encoding):
    """Rewrite *file_path* in place with *raw_data* transcoded to UTF-8.

    Args:
        file_path: Path of the file to overwrite.
        raw_data: Original file contents as bytes.
        encoding: Codec name used to decode *raw_data*.

    Any exception raised while transcoding or writing is caught and
    reported on stdout; nothing is re-raised.
    """
    try:
        # Transcode fully before opening the file for writing, so a decode
        # failure leaves the file untouched.
        transcoded = raw_data.decode(encoding).encode('utf-8')
        with open(file_path, 'wb') as target:
            target.write(transcoded)
        print(f"Converted {file_path} to UTF-8")
    except Exception as err:
        print(f"Error converting {file_path} to UTF-8: {err}")
def main():
    """Command-line entry point: check one file and convert it if it is JIS.

    Parses a single positional ``file_path`` argument. Paths that
    ``is_text_file`` rejects are reported and the process exits with
    status 1; otherwise the path is passed to ``process_file``.
    """
    cli = argparse.ArgumentParser(description="Convert files with JIS encoding to UTF-8 encoding.")
    cli.add_argument('file_path', help="Path to the file to be analyzed.")
    options = cli.parse_args()

    if is_text_file(options.file_path):
        process_file(options.file_path)
        return

    print(f"{options.file_path} is not a text file, skipping")
    sys.exit(1)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment