Created
April 4, 2012 05:21
-
-
Save reorx/2297990 to your computer and use it in GitHub Desktop.
char detect & convert
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# -*- coding: utf-8 -*-
import chardet | |
# Groups of encodings that strict_detect treats as alternatives to each other:
# when chardet's confidence is below 1 and its guess is in a group, every
# member of that group is tried, in the order listed here.
BROTHER_ENCODINGS = [('GB2312', 'GBK', 'GB18030')]
def strict_detect(text):
    """
    Detect the encoding of an encoded string strictly.

    chardet's guess is only accepted if `text` really decodes with it. When
    chardet's confidence is below 1 and the guess belongs to a group in
    BROTHER_ENCODINGS, every member of that group is tried in order.

    Arguments::

        :text
            the raw (encoded) string to inspect.

    Returns the name (str) of the first candidate encoding that decodes
    `text` successfully.

    Raises DetectionFailed if chardet reports no encoding at all, or if no
    candidate encoding is able to decode `text`.
    """
    res = chardet.detect(text)
    guess = res['encoding']
    if guess is None:
        # chardet reports encoding None when it cannot make a guess; passing
        # that to decode() would raise TypeError rather than DetectionFailed.
        raise DetectionFailed('raw result from chardet: %s' % res)

    candidates = (guess, )
    if res['confidence'] < 1:
        for family in BROTHER_ENCODINGS:
            if guess in family:
                candidates = family
                break

    for ec in candidates:
        try:
            text.decode(ec)
        except (UnicodeDecodeError, LookupError):
            # LookupError: the name is not a codec this interpreter knows,
            # so it cannot be the answer; move on to the next candidate.
            continue
        return ec
    raise DetectionFailed('raw result from chardet: %s' % res)
class DetectionFailed(Exception):
    """Raised by strict_detect when no encoding that decodes the input is found."""
def convert_string(s, to_ec, from_ec=None):
    """
    Re-encode the string `s` into the encoding `to_ec`.

    `from_ec` names the encoding `s` is currently in; when it is missing
    (or otherwise falsy) the encoding is detected with strict_detect.
    Returns the re-encoded string.
    """
    source_ec = from_ec or strict_detect(s)
    decoded = s.decode(source_ec)
    return decoded.encode(to_ec)
def convert_file(fname, to_ec, from_ec=None, replace=False, to_name=None):
    """
    convert a file to an indicated encoding.

    Arguments::

        :fname
            path of the file to convert.

        :to_ec
            encoding the new file is written in.

        :from_ec
            if no source encoding was given, try to detect it.

        :replace
            if replace was set to True, it overrides the to_name argument and
            the converted content is written back over the original file.

        :to_name
            only effective when replace was False,
            if to_name was given, use it as the new file's name,
            if to_name was not given, the name is generated by inserting
            '_<to_ec>' before the extension ('a.txt' -> 'a_utf8.txt').

    Raises ValueError if the source file is empty.
    """
    import os

    if replace:
        to_name = fname
    elif not to_name:
        # splitext only splits off the extension of the last path component,
        # so dots in directory names or inside the base name are preserved
        # and an extensionless name does not get a dangling '.'.
        root, ext = os.path.splitext(fname)
        to_name = '%s_%s%s' % (root, to_ec, ext)

    # Binary mode: the content is encoded bytes and must reach the converter
    # untouched (no newline translation, no implicit decoding).
    with open(fname, 'rb') as source:
        buf = source.read()
    if not buf:
        # An explicit raise, unlike assert, is not stripped under `python -O`.
        raise ValueError('file to convert should have content')

    # Convert before opening the target: opening for writing truncates it, and
    # with replace=True the target is the source, so a failed detection or
    # conversion must not be allowed to wipe the original file.
    converted = convert_string(buf, to_ec, from_ec)
    with open(to_name, 'wb') as target:
        target.write(converted)
def main():
    """
    Command-line entry point.

    Parses the options (-f/--from-ec, -t/--to-ec, -p/--replace) and the
    positional arguments, then passes them on to convert_file. Exactly one
    positional argument (the file to convert) or two (the file to convert and
    the new file's name) are accepted; any other count is reported through
    the parser's error method.
    """
    from optparse import OptionParser

    parser = OptionParser(
        usage='Usage: %prog [options] file_to_convert [, new_file_name]')
    parser.add_option(
        '-f', '--from-ec', type='str',
        help='the original encoding of the file, if not given, detecting will be tried')
    parser.add_option(
        '-t', '--to-ec', type='str', default='utf8',
        help='the output encoding of the new file, if not given, use utf8 as default')
    parser.add_option(
        '-p', '--replace', action='store_true', default=False,
        help='if indicated, the new file will replace the old one, use it carefully')
    options, args = parser.parse_args()

    positional_count = len(args)
    if positional_count not in (1, 2):
        # parser.error prints the message and exits, so nothing below runs.
        parser.error('incorrect number of arguments, must be one or two')

    fname = args[0]
    to_name = args[1] if positional_count == 2 else None

    convert_file(
        fname,
        options.to_ec,
        from_ec=options.from_ec,
        replace=options.replace,
        to_name=to_name,
    )
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment