Skip to content

Instantly share code, notes, and snippets.

@kota7
Created May 23, 2024 02:54
Show Gist options
  • Save kota7/ef01d00910bb2b01c3b3dae5682962a0 to your computer and use it in GitHub Desktop.
Save kota7/ef01d00910bb2b01c3b3dae5682962a0 to your computer and use it in GitHub Desktop.
nkf (https://github.com/nurse/nkf) が使えないときに、テキストファイルのエンコーディングをutf8に変換するスクリプト
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Usage:
to_utf.py <text file path>
"""
import sys
from typing import BinaryIO
from chardet.universaldetector import UniversalDetector
def _detect_encoding(stream: BinaryIO) -> dict:
    """Guess the character encoding of the bytes readable from *stream*.

    The stream is consumed line by line and each line is handed to a
    chardet ``UniversalDetector``. Reading stops as soon as the detector
    reports it is done, or when the stream is exhausted.

    Args:
        stream: A binary stream, read from its current position. It is
            neither rewound nor closed here.

    Returns:
        The detector's ``result`` dict (the caller in this file reads its
        ``"encoding"`` key).
    """
    sniffer = UniversalDetector()
    chunks = iter(stream)
    # Check the detector before pulling the next line so that no input is
    # read once it has already made up its mind.
    while not sniffer.done:
        chunk = next(chunks, None)
        if chunk is None:
            # Stream exhausted before the detector became confident.
            break
        sniffer.feed(chunk)
    # close() finalises the guess from whatever has been fed so far.
    sniffer.close()
    return sniffer.result
def detect_encoding(textfile: "BinaryIO | str") -> dict:
    """Detect the encoding of a text file given as a path or a binary stream.

    Args:
        textfile: Either a filesystem path (``str``), which is opened in
            binary mode and closed again here, or an already-open binary
            stream, which is read from its current position and left open
            for the caller to manage.

    Returns:
        The result dict produced by ``_detect_encoding``.
    """
    if isinstance(textfile, str):
        with open(textfile, "rb") as stream:
            return _detect_encoding(stream)
    # Stream input: the result must be returned explicitly, otherwise the
    # caller receives None instead of the detection result.
    return _detect_encoding(textfile)
def convert(
        textfile: str,
        encoding_from: "str | None" = None):
    """Yield the decoded lines of a text file, without their trailing newline.

    This is a generator, so the file is not touched (and no encoding
    detection happens) until iteration starts.

    Args:
        textfile: Path of the file to read.
        encoding_from: Name of the encoding the file is stored in. When
            ``None``, the encoding is guessed with ``detect_encoding``.

    Yields:
        Each line of the file as ``str`` with the trailing newline removed.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content does not match the encoding.
        LookupError: If the encoding name is unknown to Python.
    """
    if encoding_from is None:
        detected = detect_encoding(textfile)
        # If detection yields None, open() falls back to the platform's
        # default text encoding.
        encoding_from = detected["encoding"]
    # The context manager guarantees the file is closed when the generator
    # is exhausted, closed early, or garbage-collected.
    with open(textfile, encoding=encoding_from) as stream:
        for line in stream:
            yield line.rstrip("\n")
if __name__ == "__main__":
    # Command-line entry point: print the given file's lines to stdout,
    # decoded from the detected encoding.
    if len(sys.argv) < 2:
        # No input path: show usage on stderr instead of failing silently.
        sys.exit("Usage: to_utf.py <text file path>")
    textfile = sys.argv[1]
    try:
        for line in convert(textfile):
            print(line)
    except BrokenPipeError:
        # The reader (e.g. `head`) closed the pipe early. Point stdout at
        # devnull so the interpreter's exit-time flush does not raise again.
        import os
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        sys.exit(1)
    except (OSError, UnicodeError, LookupError) as err:
        # Unreadable file, undecodable content, or unknown encoding name:
        # report on stderr and exit non-zero rather than hiding the failure.
        sys.exit("to_utf.py: {}: {}".format(textfile, err))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment