Last active
April 21, 2018 10:03
-
-
Save akkefa/9e808d5bc8aba16d2f789130b0577a29 to your computer and use it in GitHub Desktop.
Python unicode Handling
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Encode: unicode code point to bytes | |
>>> s = u'Café' | |
>>> type(s.encode('utf-8')) | |
<class 'bytes'> | |
Decode: bytes to unicode code point | |
>>> s = bytes('Café', encoding='utf-8') | |
>>> s.decode('utf-8') | |
'Café' | |
Get unicode code point | |
>>> s = u'Café' | |
>>> for _c in s: print('U+%04x' % ord(_c)) | |
// print(s.encode("unicode_escape")) | |
... | |
U+0043 | |
U+0061 | |
U+0066 | |
U+00e9 | |
>>> u = '中文' | |
>>> for _c in u: print('U+%04x' % ord(_c)) | |
... | |
U+4e2d | |
U+6587 | |
# python2 str is equivalent to byte string | |
>>> s = 'Café' # byte string | |
>>> s | |
'Caf\xc3\xa9' | |
>>> type(s) | |
<type 'str'> | |
>>> u = u'Café' # unicode string | |
>>> u | |
u'Caf\xe9' | |
>>> type(u) | |
<type 'unicode'> | |
# python3 str is equivalent to unicode string | |
>>> s = 'Café' | |
>>> type(s) | |
<class 'str'> | |
>>> s | |
'Café' | |
>>> s.encode('utf-8') | |
b'Caf\xc3\xa9' | |
>>> s.encode('utf-8').decode('utf-8') | |
'Café' | |
# python2 take str char as byte character | |
>>> s= 'Café' | |
>>> print [_c for _c in s] | |
['C', 'a', 'f', '\xc3', '\xa9'] | |
>>> len(s) | |
5 | |
>>> s = u'Café' | |
>>> print [_c for _c in s] | |
[u'C', u'a', u'f', u'\xe9'] | |
>>> len(s) | |
4 | |
# python3 take str char as unicode character | |
>>> s = 'Café' | |
>>> print([_c for _c in s]) | |
['C', 'a', 'f', 'é'] | |
>>> len(s) | |
4 | |
>>> bs = bytes(s, encoding='utf-8') | |
>>> print(bs) | |
b'Caf\xc3\xa9' | |
>>> len(bs) | |
5 | |
# unicode normalization | |
# python 3 | |
>>> u1 = 'Café' # unicode string | |
>>> u2 = 'Cafe\u0301' | |
>>> u1, u2 | |
('Café', 'Café') | |
>>> len(u1), len(u2) | |
(4, 5) | |
>>> u1 == u2 | |
False | |
>>> u1.encode('utf-8') # get u1 byte string | |
b'Caf\xc3\xa9' | |
>>> u2.encode('utf-8') # get u2 byte string | |
b'Cafe\xcc\x81' | |
>>> from unicodedata import normalize | |
>>> s1 = normalize('NFC', u1) # get u1 NFC format | |
>>> s2 = normalize('NFC', u2) # get u2 NFC format | |
>>> s1 == s2 | |
True | |
>>> s1.encode('utf-8'), s2.encode('utf-8') | |
(b'Caf\xc3\xa9', b'Caf\xc3\xa9') | |
>>> s1 = normalize('NFD', u1) # get u1 NFD format | |
>>> s2 = normalize('NFD', u2) # get u2 NFD format | |
>>> s1, s2 | |
('Café', 'Café') | |
>>> s1 == s2 | |
True | |
>>> s1.encode('utf-8'), s2.encode('utf-8') | |
(b'Cafe\xcc\x81', b'Cafe\xcc\x81') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment