Last active
January 28, 2021 10:20
-
-
Save tinkernels/1d62207797a640abb1d8797a29f5f4e0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import os
import shutil
import subprocess
from pathlib import Path

from binaryornot.check import is_binary
from chardet.universaldetector import UniversalDetector
# Root directory that is scanned recursively.
base_path = "."

# Detected encoding name (lower-cased) -> encoding name handed to iconv.
encoding_map = {
    "utf-8-sig": "utf-8",
}

# Bookkeeping that is filled while the tree is processed and printed at the end.
binary_extnames = set()  # suffixes of files classified as binary
txt_extnames = set()  # suffixes of files classified as text
encodings = set()  # every encoding name the detector reported
# Records of the form {"confidence": ..., "encoding": ..., "path": ...}.
files_manipulated = []
failed_files = []

# Suffixes that are never converted: these are the backup (.orig) and
# temporary (.iconv) files the conversion itself creates.
except_extnames = ['.orig', '.iconv']

# Suffixes eligible for conversion; '' matches files without any suffix.
convert_extnames = [
    '.txt', '.toml', '.py', '.in', '.sh', '.bat',
    '.h', '.c', '.cpp', '.hpp', '.pyx', '',
]
# Single chardet detector shared by all files; convert_file() resets it before
# feeding each new file.
detector = UniversalDetector()
def _run_without_shell(argv, **kwargs):
    """Run *argv* directly (no shell) and return its exit status.

    Returns 127, the status a shell reports for "command not found", when the
    program is not installed, so a missing tool is handled like any other
    non-zero exit instead of raising.
    """
    try:
        return subprocess.run(argv, **kwargs).returncode
    except FileNotFoundError:
        return 127


def convert_file(fpath: str):
    """Detect the encoding of *fpath* and rewrite the file in place as UTF-8.

    The file is fed line by line to the shared ``detector``. When the detection
    confidence is above 0.6 the content is piped through ``iconv`` into
    ``<fpath>.iconv``, that file replaces the original, and ``vim`` normalises
    line endings, BOM and tabs. Otherwise the file is left untouched and
    recorded in ``failed_files``.

    Every detected encoding is added to ``encodings``; each processed file is
    recorded in ``files_manipulated`` and/or ``failed_files`` as a dict with
    the keys ``confidence``, ``encoding`` and ``path``.

    External commands are started without a shell, so quotes, ``$`` or
    backticks in a file name cannot break or inject into the command line.

    Args:
        fpath: Path of the file to convert.
    """
    detector.reset()
    with open(fpath, "rb") as fc:
        for line in fc:
            detector.feed(line)
            if detector.done:
                break
    detector.close()

    detected = detector.result['encoding']
    confidence = detector.result['confidence']
    encodings.add(detected)
    record = {"confidence": f"{confidence:.2f}",
              "encoding": detected, "path": fpath}

    # Not confident enough (or nothing detected at all): leave the file alone.
    if detected is None or not confidence > .6:
        failed_files.append(record)
        return

    # Some detected names are replaced before being handed to iconv
    # (see encoding_map).
    source_encoding = encoding_map.get(detected.lower(), detected)

    converted_path = f"{fpath}.iconv"
    with open(fpath, "rb") as src, open(converted_path, "wb") as dst:
        ret = _run_without_shell(
            ["iconv", "-c", "-f", source_encoding, "-t", "utf-8"],
            stdin=src, stdout=dst)

    # NOTE(review): the indentation of the original source was lost; this
    # assumes that only the backup and the failure record depend on iconv's
    # exit status, and that the converted file is moved into place either
    # way -- confirm against the intended behaviour.
    if ret != 0:
        backup_path = f"{fpath}.orig"
        # Never clobber an existing backup (same as ``cp -n``).
        if not os.path.exists(backup_path):
            shutil.copy(fpath, backup_path)
        failed_files.append(record)

    os.replace(converted_path, fpath)
    # "--" ends vim's option parsing so a file name starting with "-" or "+"
    # is still treated as a file.
    _run_without_shell(
        ["vim",
         "+set ff=unix nobomb tabstop=4 shiftwidth=4 expandtab",
         "+retab", "+wq", "--", fpath])
    files_manipulated.append(dict(record))
# Walk the tree below base_path, classify every file as binary or text by its
# content, and convert the text files whose suffix is on the conversion list.
for dir_name, _subdirs, file_names in os.walk(base_path):
    for fname in file_names:
        # os.walk() already yields directory paths that start with base_path;
        # joining base_path in front again would double the prefix for a
        # relative base_path other than ".".
        f_path = str(Path(dir_name, fname))
        suffix_ = Path(f_path).suffix.lower()
        if is_binary(f_path):
            binary_extnames.add(suffix_)
            continue
        txt_extnames.add(suffix_)
        # NOTE(review): the indentation of the original source was lost; this
        # assumes only files classified as text are candidates for
        # conversion -- confirm.
        if suffix_ not in except_extnames and suffix_ in convert_extnames:
            convert_file(fpath=f_path)

# Summary report.
print(f"{'-'*32}\nconverted file:")
for f_ in files_manipulated:
    print(f"{f_['confidence']} {f_['encoding']}\t{f_['path']}")
print(f"{'-'*32}\nconvert failed file:")
for f_ in failed_files:
    print(f"{f_['confidence']} {f_['encoding']}\t{f_['path']}")
print(f"{'-'*32}\nbinary file ext: {binary_extnames}")
print(f"{'-'*32}\ntxt file ext: {txt_extnames}")
print(f"{'-'*32}\ndetected encodings: {encodings}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment