Created
September 7, 2012 16:28
-
-
Save fnielsen/3667575 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# python2.6 | |
'Finn Årup Nielsen' | |
u'Finn Årup Nielsen' | |
unicode('Finn Årup Nielsen', 'utf-8') | |
# So what is wrong with that? | |
len(u'Finn Arup Nielsen') | |
len('Finn Årup Nielsen') | |
len(u'Finn Årup Nielsen') | |
# Another example of a problem: Finding words with regular expressions: | |
import re | |
re.findall('\w+', 'Finn Årup Nielsen') | |
# ['Finn', 'rup', 'Nielsen'] | |
re.findall('\w+', u'Finn Årup Nielsen', re.UNICODE) | |
# [u'Finn', u'\xc5rup', u'Nielsen'] | |
# Encoding in Python 3 | |
'Finn Årup Nielsen' | |
u'Finn Årup Nielsen' | |
# Encoding from Unicode | |
u'Rådvad Æblerød'.encode('utf-8') | |
# 'R\xc3\xa5dvad \xc3\x86bler\xc3\xb8d' | |
u'Rådvad Æblerød'.encode('ascii') | |
# UnicodeEncodeError: 'ascii' codec can't encode character u'\xe5' ... | |
u'Rådvad Æblerød'.encode('ascii', 'ignore') | |
# 'Rdvad blerd' | |
u'Rådvad Æblerød'.encode('ascii', 'replace') | |
# 'R?dvad ?bler?d' | |
u'Rådvad Æblerød'.encode('ascii', 'xmlcharrefreplace') | |
# 'R&#229;dvad &#198;bler&#248;d' | |
# Files | |
f = open('text-with-utf-8.txt', 'wb') | |
f.write('R\xc3\xa5dvad \xc3\x86belr\xc3\xb8d') | |
f.close() | |
# $ hexdump -C text-with-utf-8.txt | |
# 00000000 52 c3 a5 64 76 61 64 20 c3 86 62 65 6c 72 c3 b8 |R..dvad ..belr..| | |
# 00000010 64 |d| | |
# File I/O with Python 2 | |
print(len(open('text-with-utf-8.txt').read())) | |
print(len(unicode(open('text-with-utf-8.txt').read(), 'utf-8'))) | |
import codecs | |
print(len(codecs.open('text-with-utf-8.txt', encoding='utf-8').read())) | |
# File I/O with Python 2 default encoding | |
print(len(open('text-with-utf-8.txt').read())) | |
print(len(unicode(open('text-with-utf-8.txt').read()))) | |
# UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in ... | |
# Use the "sys reload" trick: | |
import sys | |
reload(sys) | |
sys.setdefaultencoding('utf-8') | |
print(len(open('text-with-utf-8.txt').read())) | |
print(len(unicode(open('text-with-utf-8.txt').read()))) | |
# Python 3 | |
# Python 3 reading with UTF-8 environment | |
# $ LANG=en_US.utf8 ; python3 | |
print(len(open('text-with-utf-8.txt').read())) | |
print(len(open('text-with-utf-8.txt', encoding='utf-8').read())) | |
# Python 3 reading with non-UTF-8 environment | |
# $ LANG=C; python3 | |
print(len(open('text-with-utf-8.txt').read())) | |
# UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 ... | |
print(len(open('text-with-utf-8.txt', encoding='utf-8').read())) | |
for enc in ['ascii', 'ISO8859-1', 'latin1', 'utf-8']: | |
try: | |
s = open('text-with-utf-8.txt', encoding=enc).read() | |
except UnicodeDecodeError: | |
continue | |
print("Read with encoding =", enc) | |
break | |
print(s) | |
# gives you a UnicodeEncodeError | |
print(s.encode('ascii', 'replace')) | |
# b'R??dvad ??belr??d' | |
# Encoding in source code | |
# Python 2 script with UTF-8 encoding | |
#!/usr/bin/python2.6 | |
# -*- coding: utf-8 -*- | |
print("Rådvad Knivfabrik") | |
# Python3 | |
Æ = 3 | |
A = 1 | |
A + Æ | |
""" | |
http://docs.python.org/howto/unicode.html | |
Unicode HOWTO in Python documentation | |
http://diveintopython.org/xml_processing/unicode.html | |
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html | |
Kumar McMillan's talk | |
http://farmdev.com/talks/unicode/ | |
"Unicode In Python, Completely Demystified" from PyCon 2008.
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment