Last active
May 29, 2020 17:50
-
-
Save BlaayLock/e496d4341a820cb28629a7d38757a3f3 to your computer and use it in GitHub Desktop.
Python: Convert Unicode to ASCII without errors, utf8 -> cp1251
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! python2 | |
# //coding: utf-8 | |
# coding=utf-8 | |
# -*- coding: utf-8 -*- | |
# vim: set fileencoding=utf-8 : | |
# Demo 1: ASCII-only text. A unicode object is already decoded text, so it is
# encoded straight to cp1251 bytes. Calling .decode() on it (as this demo
# used to) makes Python 2 implicitly encode it to ASCII first, which only
# works by accident for pure-ASCII text.
a = u'1a'
a = a.encode('cp1251')
# print(a) with a single argument behaves the same as the Python 2 print
# statement and is also valid Python 3 syntax.
print(a)

# Demo 2: text containing U+0430. Encoding to UTF-8 and decoding back yields
# the same unicode object, so that round trip is dropped and only the final
# encode to cp1251 is kept.
a = u'1\u0430'
a = a.encode('cp1251')
print(a)
#~ Do not confuse Unicode with UTF-8! From Python 2's point of view, UTF-8 - just like cp1251 - is an array of bytes.
#~ I often get tripped up by assuming that UTF-8 is unicode. To avoid confusion, I also recommend declaring the source-code encoding at the top of the file with a magic comment.
#~ Details: py-my.ru/post/4bfb3c6a1d41c846bc00009b
# Convert `msg` to cp1251 bytes and print it, whatever form it arrives in.
msg = u'1\u0430'

# Plain truthiness check for "not empty". The previous len(str(msg)) test
# called str() on a unicode object, which encodes with ASCII under Python 2
# and raises UnicodeEncodeError for non-ASCII text such as this sample.
if msg:
    # type(u'') is `unicode` on Python 2 and `str` on Python 3.
    if isinstance(msg, type(u'')):
        # Already decoded text: encode straight to the target code page.
        print(msg.encode('cp1251'))
    # NOTE: get_codepage is defined further down in this file, so this
    # byte-string branch only works once that definition has been executed.
    elif get_codepage(msg) == 'UTF-8':
        # Byte string recognised as UTF-8: transcode it to cp1251.
        print(msg.decode('UTF-8').encode('cp1251'))
    else:
        # Any other byte string is printed unchanged.
        print(msg)
# Code page names that get_codepage() can report, mapped to a charset label.
encodings = {
    'UTF-8': 'utf-8',
    'CP1251': 'windows-1251',
    'KOI8-R': 'koi8-r',
    'IBM866': 'ibm866',
    'ISO-8859-5': 'iso-8859-5',
    'MAC': 'mac',
}

# Fixed order in which candidates are compared when choosing the winner, so
# that equal scores are resolved the same way on every interpreter (plain
# dict iteration order is arbitrary on Python 2).
_CODEPAGE_ORDER = ('UTF-8', 'CP1251', 'KOI8-R', 'IBM866', 'ISO-8859-5', 'MAC')


def get_codepage(str=None):
    """Guess which Cyrillic code page a string is written in.

    Every value in the range 128..255 is checked against the byte ranges this
    heuristic associates with lower-case and upper-case letters of each
    candidate code page, and the matching candidates collect points. UTF-8 is
    scored on pairs: a byte that directly follows a 208 or 209 lead byte.

    Args:
        str: The data to inspect. A byte string (``bytes``/``bytearray``, or
            ``str`` on Python 2) is examined byte by byte; a text string is
            examined by the ordinal of each character. ``None`` and empty
            input are accepted. The name shadows the builtin and is kept only
            for backward compatibility with existing callers.

    Returns:
        The key of ``encodings`` with the highest score, or ``''`` when
        nothing scored (empty input or no values in 128..255). Equal scores
        go to the candidate listed first in ``_CODEPAGE_ORDER``.
    """
    data = str  # alias right away so the shadowed builtin name is not reused
    if not data:
        return ''

    # Weights: a lower-case hit counts more than an upper-case one, and UTF-8
    # pair hits (doubled below) count more than single-byte hits.
    upper_weight = 1
    lower_weight = 3
    utf_upper_weight = 5
    utf_lower_weight = 7

    scores = dict.fromkeys(encodings, 0)

    if isinstance(data, (bytes, bytearray)):
        # bytearray yields integers on both Python 2 and Python 3.
        codes = bytearray(data)
    else:
        codes = (ord(char) for char in data)

    prev = 0
    for code in codes:
        # Values outside 128..255 never score. Forget the previous value too,
        # so a UTF-8 lead byte only pairs with the value right after it.
        if code < 128 or code > 255:
            prev = 0
            continue

        # UTF-8 (two-byte sequences led by 208 or 209)
        if prev == 208 and (143 < code < 176 or code == 129):
            scores['UTF-8'] += utf_upper_weight * 2
        if (prev == 208 and (code == 145 or 175 < code < 192)) \
                or (prev == 209 and 127 < code < 144):
            scores['UTF-8'] += utf_lower_weight * 2

        # CP1251
        if 223 < code < 256 or code == 184:
            scores['CP1251'] += lower_weight
        if 191 < code < 224 or code == 168:
            scores['CP1251'] += upper_weight

        # KOI8-R
        if 191 < code < 224 or code == 163:
            scores['KOI8-R'] += lower_weight
        if 222 < code < 256 or code == 179:
            scores['KOI8-R'] += upper_weight

        # IBM866
        if 159 < code < 176 or 223 < code < 241:
            scores['IBM866'] += lower_weight
        if 127 < code < 160 or code == 241:
            scores['IBM866'] += upper_weight

        # ISO-8859-5
        if 207 < code < 240 or code == 161:
            scores['ISO-8859-5'] += lower_weight
        if 175 < code < 208 or code == 241:
            scores['ISO-8859-5'] += upper_weight

        # MAC
        if 221 < code < 255:
            scores['MAC'] += lower_weight
        if 127 < code < 160:
            scores['MAC'] += upper_weight

        prev = code

    # Strict ">" keeps the earliest candidate in _CODEPAGE_ORDER on a tie and
    # leaves '' when every score is zero.
    best_name = ''
    best_score = 0
    for name in _CODEPAGE_ORDER:
        if scores[name] > best_score:
            best_score = scores[name]
            best_name = name
    return best_name
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment