Created
May 23, 2024 02:54
-
-
Save kota7/ef01d00910bb2b01c3b3dae5682962a0 to your computer and use it in GitHub Desktop.
nkf (https://github.com/nurse/nkf) が使えないときに、テキストファイルのエンコーディングをutf8に変換するスクリプト
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| """ | |
| Usage: | |
| to_utf.py <text file path> | |
| """ | |
| import sys | |
| from typing import BinaryIO | |
| from chardet.universaldetector import UniversalDetector | |
def _detect_encoding(stream: BinaryIO) -> dict:
    """Guess the character encoding of an open binary stream.

    Lines are pulled from *stream* one at a time and handed to a chardet
    ``UniversalDetector``; reading stops as soon as the detector reports it
    is done, or when the stream runs out.

    Args:
        stream: Iterable binary stream positioned where detection should
            start. It is consumed (at least partially) but not closed.

    Returns:
        The detector's ``result`` object after ``close()`` has been called.
        The caller in this file reads its ``"encoding"`` entry.
    """
    sniffer = UniversalDetector()
    remaining = iter(stream)
    # Stop pulling lines the moment the detector has made up its mind.
    while not sniffer.done:
        chunk = next(remaining, None)
        if chunk is None:
            # Stream exhausted before the detector became confident.
            break
        sniffer.feed(chunk)
    # close() finalises the guess from whatever has been fed so far.
    sniffer.close()
    return sniffer.result
def detect_encoding(textfile: "BinaryIO | str"):
    """Detect the character encoding of a text file.

    Args:
        textfile: Either a filesystem path (``str``) or an already-open
            binary stream. A path is opened in binary mode and closed again
            before returning; a stream is read from its current position
            and left open for the caller to manage.

    Returns:
        Whatever ``_detect_encoding`` returns for the stream (the chardet
        detector result; the caller in this file reads its ``"encoding"``
        entry).

    Raises:
        OSError: If *textfile* is a path that cannot be opened.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses as paths.
    if isinstance(textfile, str):
        with open(textfile, "rb") as stream:
            return _detect_encoding(stream)
    # Stream input: the result must be returned here as well -- previously
    # it was computed and then discarded, so callers always received None.
    return _detect_encoding(textfile)
def convert(
        textfile: str,
        encoding_from: "str | None" = None):
    """Lazily yield the decoded lines of a text file.

    Args:
        textfile: Path of the file to read.
        encoding_from: Name of the file's encoding. When ``None`` the
            encoding is guessed with ``detect_encoding``.

    Yields:
        Each line of the file as ``str`` with its trailing newline removed.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid in the chosen
            encoding.
        LookupError: If the encoding name is unknown to Python.
    """
    if encoding_from is None:
        detected = detect_encoding(textfile)
        # NOTE(review): if detection fails the "encoding" entry is presumably
        # None, in which case open() falls back to the platform default
        # encoding -- confirm this best-effort behaviour is intended.
        encoding_from = detected["encoding"]
    # Use a context manager so the file handle is closed deterministically
    # once the generator is exhausted or closed, instead of relying on
    # garbage collection.
    with open(textfile, encoding=encoding_from) as stream:
        for line in stream:
            yield line.rstrip("\n")
if __name__ == "__main__":
    # Command-line entry point: decode the file named by the first argument
    # and print it line by line to standard output.
    import os

    if len(sys.argv) < 2:
        # No path given: show the usage text on stderr and exit non-zero
        # instead of failing silently.
        sys.exit(__doc__ or "Usage: to_utf.py <text file path>")
    textfile = sys.argv[1]
    # NOTE(review): print() encodes with sys.stdout's encoding, which is not
    # necessarily UTF-8 on every platform -- confirm whether stdout should be
    # forced to UTF-8 here.
    try:
        for line in convert(textfile):
            print(line)
    except BrokenPipeError:
        # The reader of our output (e.g. `head`) went away. Point stdout at
        # devnull so the interpreter's flush at exit does not raise again,
        # then stop quietly.
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        sys.exit(1)
    except (OSError, UnicodeError, LookupError) as err:
        # Unreadable file, undecodable content, or unknown encoding name:
        # report the reason on stderr and exit non-zero rather than hiding it.
        sys.exit("{}: {}".format(textfile, err))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment