Skip to content

Instantly share code, notes, and snippets.

@fnielsen
Created September 7, 2012 16:28
Show Gist options
  • Save fnielsen/3667575 to your computer and use it in GitHub Desktop.
Save fnielsen/3667575 to your computer and use it in GitHub Desktop.
# python2.6
# Three ways of writing the same name, which contains the non-ASCII letter Å.
'Finn Årup Nielsen'  # plain Python 2 string literal: a byte string
u'Finn Årup Nielsen'  # unicode literal
unicode('Finn Årup Nielsen', 'utf-8')  # byte string explicitly decoded as UTF-8
# So what is wrong with that?
# Compare the lengths reported for the byte string and the unicode strings.
len(u'Finn Arup Nielsen')  # ASCII-only baseline (plain "A")
len('Finn Årup Nielsen')  # byte string: counts bytes; presumably one more than the baseline when Å is UTF-8 encoded
len(u'Finn Årup Nielsen')  # unicode string: counts characters
# Another example of a problem: Finding words with regular expressions:
import re

# Patterns are written as raw strings so the backslash in \w reaches the
# regex engine unchanged instead of being treated as a string escape.
# Python 2, byte-string subject: \w only matches ASCII word characters,
# so the non-ASCII letter is dropped from the second word.
re.findall(r'\w+', 'Finn Årup Nielsen')
# ['Finn', 'rup', 'Nielsen']
# Unicode subject plus re.UNICODE: \w follows the Unicode character
# database, so the whole word is found.
re.findall(r'\w+', u'Finn Årup Nielsen', re.UNICODE)
# [u'Finn', u'\xc5rup', u'Nielsen']
# Encoding in Python 3
# In Python 3 both literals below are Unicode text; the u prefix is
# accepted (Python 3.3+) but redundant.
'Finn Årup Nielsen'
u'Finn Årup Nielsen'
# Encoding from Unicode
# .encode() converts text to bytes. The optional second argument selects
# what happens to characters the target codec cannot represent.
# (The outputs shown below are Python 2 reprs.)
u'Rådvad Æblerød'.encode('utf-8')
# 'R\xc3\xa5dvad \xc3\x86bler\xc3\xb8d'
# Default error handling raises on the first unencodable character:
u'Rådvad Æblerød'.encode('ascii')
# UnicodeEncodeError: 'ascii' codec can't encode character u'\xe5' ...
# 'ignore' silently drops unencodable characters:
u'Rådvad Æblerød'.encode('ascii', 'ignore')
# 'Rdvad blerd'
# 'replace' substitutes a question mark for each unencodable character:
u'Rådvad Æblerød'.encode('ascii', 'replace')
# 'R?dvad ?bler?d'
# 'xmlcharrefreplace' substitutes an XML numeric character reference:
u'Rådvad Æblerød'.encode('ascii', 'xmlcharrefreplace')
# 'R&#229;dvad &#198;bler&#248;d'
# Files
# Create the sample file used by the file I/O examples: 17 bytes of
# UTF-8 encoded text written in binary mode. The b'' literal makes the
# write valid on both Python 2.6+ and Python 3, and the with-statement
# closes the file even if the write fails.
with open('text-with-utf-8.txt', 'wb') as f:
    f.write(b'R\xc3\xa5dvad \xc3\x86belr\xc3\xb8d')
# $ hexdump -C text-with-utf-8.txt
# 00000000 52 c3 a5 64 76 61 64 20 c3 86 62 65 6c 72 c3 b8 |R..dvad ..belr..|
# 00000010 64 |d|
# File I/O with Python 2
# Reading without decoding yields a byte string, so len() counts bytes.
print(len(open('text-with-utf-8.txt').read()))
# Decoding the bytes as UTF-8 yields a unicode string, so len() counts
# characters instead.
print(len(unicode(open('text-with-utf-8.txt').read(), 'utf-8')))
import codecs
# codecs.open() with an encoding decodes while reading and returns unicode.
print(len(codecs.open('text-with-utf-8.txt', encoding='utf-8').read()))
# File I/O with Python 2 default encoding
print(len(open('text-with-utf-8.txt').read()))
# unicode() without an explicit encoding uses the default codec, which is
# 'ascii' here (see the error message below).
print(len(unicode(open('text-with-utf-8.txt').read())))
# UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in ...
# Use the "sys reload" trick:
# reload(sys) is needed before sys.setdefaultencoding can be called.
# NOTE(review): this changes the default encoding for the whole process
# and is widely discouraged; explicit decoding as shown above is safer.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
# Same two reads as before; the implicit unicode() conversion now decodes
# as UTF-8 instead of raising.
print(len(open('text-with-utf-8.txt').read()))
print(len(unicode(open('text-with-utf-8.txt').read())))
# Python 3
# Python 3 reading with UTF-8 environment
# $ LANG=en_US.utf8 ; python3
# Text-mode open() without an encoding argument decodes with the locale's
# preferred encoding, so both reads below decode the file as UTF-8 here.
print(len(open('text-with-utf-8.txt').read()))
print(len(open('text-with-utf-8.txt', encoding='utf-8').read()))
# Python 3 reading with non-UTF-8 environment
# $ LANG=C; python3
# With the C locale the implicit encoding is ASCII, so the first
# non-ASCII byte makes the read fail.
# NOTE(review): Python 3.7+ may coerce the C locale to UTF-8, in which
# case this error no longer occurs; confirm on the target version.
print(len(open('text-with-utf-8.txt').read()))
# UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 ...
# Passing the encoding explicitly makes the read independent of the locale.
print(len(open('text-with-utf-8.txt', encoding='utf-8').read()))
# Try candidate encodings in order and keep the first one that decodes the
# file without raising. Note that ISO8859-1/latin1 map every byte to a
# character and therefore never raise, so 'utf-8' is never reached with
# this ordering and the UTF-8 bytes are read as two characters each.
for enc in ['ascii', 'ISO8859-1', 'latin1', 'utf-8']:
    try:
        # The with-statement closes the file even when decoding fails.
        with open('text-with-utf-8.txt', encoding=enc) as f:
            s = f.read()
    except UnicodeDecodeError:
        continue
    print("Read with encoding =", enc)
    break
print(s)
# gives you a UnicodeEncodeError (when stdout cannot encode the
# non-ASCII characters, e.g. under LANG=C)
print(s.encode('ascii', 'replace'))
# b'R??dvad ??belr??d'
# Encoding in source code
# Python 2 script with UTF-8 encoding
# NOTE: the two header lines below are shown for illustration. In a real
# script the shebang must be line 1 and the coding declaration must be on
# line 1 or 2 for the interpreter to honour them.
#!/usr/bin/python2.6
# -*- coding: utf-8 -*-
print("Rådvad Knivfabrik")
# Python3
# Python 3 accepts non-ASCII letters in identifiers, so Æ is an ordinary
# variable name distinct from A.
Æ = 3
A = 1
A + Æ  # evaluates to 4
"""
http://docs.python.org/howto/unicode.html
Unicode HOWTO in Python documentation
http://diveintopython.org/xml_processing/unicode.html
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html
Kumar McMillan's talk
http://farmdev.com/talks/unicode/
Unicode In Python, Completely Demystified, from PyCon 2008.
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment