Created
July 7, 2020 10:08
-
-
Save jerrylususu/7678173a46354992f0ba7117bb61a262 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# source:https://gitee.com/wkc/autogbktoutf8/blob/master/autogbktoutf8.py | |
# requires: chardet | |
# tested on py3, possibly working on py2 | |
#coding=utf8 | |
from __future__ import print_function | |
from __future__ import unicode_literals | |
import os | |
import os.path | |
import codecs | |
import chardet | |
from chardet.universaldetector import UniversalDetector | |
import argparse | |
# Command-line interface: one positional root directory and a verbosity
# switch (-v all prints every ignored file and every detection result).
parser = argparse.ArgumentParser(description='')
parser.add_argument('path', nargs=1)
parser.add_argument('-v', choices=('none', 'all'), default="none")
args = parser.parse_args()
path = args.path[0]

# Recursively collect every file below the root directory.
file_list = []
for root, dirs, files in os.walk(path):
    for name in files:
        # Append the joined path directly instead of rebinding `path`, so
        # the root directory given on the command line is not overwritten
        # while the tree is being walked.
        file_list.append(os.path.join(root, name))

# Only files with one of these extensions are examined by check().
allow_file_type = ['.' + i for i in 'cpp|c|py|java|txt|cc|go|h'.split('|')]
# Distinct extensions actually present under the root (order is arbitrary
# because it comes from a set).
all_file_type = list(set([os.path.splitext(i)[1] for i in file_list]))
print("all file type:", all_file_type)
print("allow file type:", allow_file_type)
print("ignore file type:", list(set(all_file_type) - set(allow_file_type)))

# One detector instance is shared by all files; check() resets it per file.
detector = UniversalDetector()
# Maps the encoding name reported by the detector to the Python codec used
# to decode the file.  Big5 and GB18030 get their own codecs; decoding them
# as GBK can silently produce wrong characters or fail on valid input.
# NOTE(review): 'EUC-TW', 'HZ-GB-2312' and 'ISO-2022-CN' keep the previous
# behaviour of being decoded as GBK -- confirm whether such files should be
# converted with a dedicated codec or skipped instead.
_DETECTED_TO_CODEC = {
    'Big5': 'big5',
    'GB2312': 'gbk',
    'GB18030': 'gb18030',
    'EUC-TW': 'gbk',
    'HZ-GB-2312': 'gbk',
    'ISO-2022-CN': 'gbk',
}


def check(name):
    """Detect the encoding of file *name* and convert it to UTF-8 if needed.

    Files whose extension is not listed in ``allow_file_type`` are skipped.
    For the others, the shared ``detector`` is fed the file line by line
    until it is confident or the file ends.  When the detected encoding is
    one of the Chinese encodings in ``_DETECTED_TO_CODEC`` the file is
    rewritten in place as UTF-8.

    Any exception raised while reading, decoding or writing propagates to
    the caller.
    """
    if os.path.splitext(name)[1] not in allow_file_type:
        if args.v == 'all':
            print("ignore", name)
        return

    detector.reset()
    # The context manager closes the handle even when detection stops early
    # or feeding raises.
    with open(name, 'rb') as source:
        for line in source:
            detector.feed(line)
            if detector.done:
                break
    detector.close()

    if args.v == 'all':
        print(name, detector.result)

    # .get() yields None both for non-Chinese encodings and for a failed
    # detection (encoding is None), so those files are left untouched.
    codec = _DETECTED_TO_CODEC.get(detector.result['encoding'])
    if codec is not None:
        print("start ", name, end=' ')
        gbk_to_utf8(name, codec)
        print("success")


def gbk_to_utf8(name, encoding='gbk'):
    """Rewrite file *name* in place as UTF-8.

    name     -- path of the file to convert.
    encoding -- codec used to decode the current content; defaults to
                'gbk', which matches the previous hard-coded behaviour.

    The whole file is decoded before it is reopened for writing, so a
    decoding error leaves the original file untouched.
    """
    with codecs.open(name, 'r', encoding) as source:
        data = source.read()
    with codecs.open(name, 'w', 'utf8') as target:
        target.write(data)
# Process every collected file.  A failure on one file is reported and the
# batch continues with the next one.
for candidate in file_list:
    try:
        check(candidate)
    except Exception as err:
        print(err)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment