fnielsen · September 7, 2012 16:28
diff --git a/encoding.py b/encoding.py
 # python2.6

 'Finn Årup Nielsen'

 u'Finn Årup Nielsen'

 unicode('Finn Årup Nielsen', 'utf-8')


 # So what is wrong with that?

 len(u'Finn Arup Nielsen')

 len('Finn Årup Nielsen')

 len(u'Finn Årup Nielsen')


 # Another example of a problem: Finding words with regular expressions:

 import re 
 re.findall('\w+', 'Finn Årup Nielsen')
 # ['Finn', 'rup', 'Nielsen']

 re.findall('\w+', u'Finn Årup Nielsen', re.UNICODE)
 # [u'Finn', u'\xc5rup', u'Nielsen']


 # Encoding in Python 3

 'Finn Årup Nielsen'

 u'Finn Årup Nielsen'


 # Encoding from Unicode

 u'Rådvad Æblerød'.encode('utf-8')
 # 'R\xc3\xa5dvad \xc3\x86bler\xc3\xb8d'

 u'Rådvad Æblerød'.encode('ascii')
 # UnicodeEncodeError: 'ascii' codec can't encode character u'\xe5' ...

 u'Rådvad Æblerød'.encode('ascii', 'ignore')
 # 'Rdvad blerd'

 u'Rådvad Æblerød'.encode('ascii', 'replace')
 # 'R?dvad ?bler?d'

 u'Rådvad Æblerød'.encode('ascii', 'xmlcharrefreplace')
 # 'R&#229;dvad &#198;bler&#248;d'


 # Files
 f = open('text-with-utf-8.txt', 'wb')
 f.write('R\xc3\xa5dvad \xc3\x86belr\xc3\xb8d')
 f.close()

 # $ hexdump -C text-with-utf-8.txt 
 # 00000000  52 c3 a5 64 76 61 64 20  c3 86 62 65 6c 72 c3 b8  |R..dvad ..belr..|
 # 00000010  64                                                |d|


 # File I/O with Python 2

 print(len(open('text-with-utf-8.txt').read()))

 print(len(unicode(open('text-with-utf-8.txt').read(), 'utf-8')))

 import codecs
 print(len(codecs.open('text-with-utf-8.txt', encoding='utf-8').read()))



 # File I/O with Python 2 default encoding

 print(len(open('text-with-utf-8.txt').read()))

 print(len(unicode(open('text-with-utf-8.txt').read())))
 # UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in ...


 # Use the "sys reload" trick:

 import sys
 reload(sys)
 sys.setdefaultencoding('utf-8')
 print(len(open('text-with-utf-8.txt').read()))

 print(len(unicode(open('text-with-utf-8.txt').read())))



 # Python 3 

 # Python 3 reading with UTF-8 environment
 # $ LANG=en_US.utf8 ; python3

 print(len(open('text-with-utf-8.txt').read()))

 print(len(open('text-with-utf-8.txt', encoding='utf-8').read()))


 # Python 3 reading with non-UTF-8 environment
 # $ LANG=C; python3 

 print(len(open('text-with-utf-8.txt').read()))
 # UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 ...

 print(len(open('text-with-utf-8.txt', encoding='utf-8').read()))


 # Try each candidate encoding in turn and keep the text from the first one
 # that decodes the file without raising UnicodeDecodeError.
 for enc in ['ascii', 'ISO8859-1', 'latin1', 'utf-8']:
  try:
    s = open('text-with-utf-8.txt', encoding=enc).read()
  except UnicodeDecodeError:
    # This encoding cannot decode the file; move on to the next candidate.
    continue
  # Report which encoding succeeded and stop probing.
  print("Read with encoding =", enc)
  break

 print(s) 
 # gives you a UnicodeEncodeError

 print(s.encode('ascii', 'replace'))
 # b'R??dvad ??belr??d'

 # Encoding in source code

 # Python 2 script with UTF-8 encoding

 #!/usr/bin/python2.6
 # -*- coding: utf-8 -*-
 print("Rådvad Knivfabrik")

 # Python3
 Æ = 3 
 A = 1
 A + Æ

 """
 http://docs.python.org/howto/unicode.html
 Unicode HOWTO in Python documentation

 http://diveintopython.org/xml_processing/unicode.html

 http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html

 Kumar McMillan's talk
 http://farmdev.com/talks/unicode/
 Unicode In Python, Completely Demystified, from PyCon 2008.
 """
	# python2.6

	'Finn Årup Nielsen'

	u'Finn Årup Nielsen'

	unicode('Finn Årup Nielsen', 'utf-8')


	# So what is wrong with that?

	len(u'Finn Arup Nielsen')

	len('Finn Årup Nielsen')

	len(u'Finn Årup Nielsen')


	# Another example of a problem: Finding words with regular expressions:

	import re
	re.findall('\w+', 'Finn Årup Nielsen')
	# ['Finn', 'rup', 'Nielsen']

	re.findall('\w+', u'Finn Årup Nielsen', re.UNICODE)
	# [u'Finn', u'\xc5rup', u'Nielsen']


	# Encoding in Python 3

	'Finn Årup Nielsen'

	u'Finn Årup Nielsen'


	# Encoding from Unicode

	u'Rådvad Æblerød'.encode('utf-8')
	# 'R\xc3\xa5dvad \xc3\x86bler\xc3\xb8d'

	u'Rådvad Æblerød'.encode('ascii')
	# UnicodeEncodeError: 'ascii' codec can't encode character u'\xe5' ...

	u'Rådvad Æblerød'.encode('ascii', 'ignore')
	# 'Rdvad blerd'

	u'Rådvad Æblerød'.encode('ascii', 'replace')
	# 'R?dvad ?bler?d'

	u'Rådvad Æblerød'.encode('ascii', 'xmlcharrefreplace')
	# 'R&#229;dvad &#198;bler&#248;d'


	# Files
	f = open('text-with-utf-8.txt', 'wb')
	f.write('R\xc3\xa5dvad \xc3\x86belr\xc3\xb8d')
	f.close()

	# $ hexdump -C text-with-utf-8.txt
	# 00000000  52 c3 a5 64 76 61 64 20  c3 86 62 65 6c 72 c3 b8  |R..dvad ..belr..|
	# 00000010  64                                                |d|


	# File I/O with Python 2

	print(len(open('text-with-utf-8.txt').read()))

	print(len(unicode(open('text-with-utf-8.txt').read(), 'utf-8')))

	import codecs
	print(len(codecs.open('text-with-utf-8.txt', encoding='utf-8').read()))



	# File I/O with Python 2 default encoding

	print(len(open('text-with-utf-8.txt').read()))

	print(len(unicode(open('text-with-utf-8.txt').read())))
	# UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in ...


	# Use the "sys reload" trick:

	import sys
	reload(sys)
	sys.setdefaultencoding('utf-8')
	print(len(open('text-with-utf-8.txt').read()))

	print(len(unicode(open('text-with-utf-8.txt').read())))



	# Python 3

	# Python 3 reading with UTF-8 environment
	# $ LANG=en_US.utf8 ; python3

	print(len(open('text-with-utf-8.txt').read()))

	print(len(open('text-with-utf-8.txt', encoding='utf-8').read()))


	# Python 3 reading with non-UTF-8 environment
	# $ LANG=C; python3

	print(len(open('text-with-utf-8.txt').read()))
	# UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 ...

	print(len(open('text-with-utf-8.txt', encoding='utf-8').read()))


	# Try each candidate encoding in turn and keep the text from the first one
	# that decodes the file without raising UnicodeDecodeError.
	for enc in ['ascii', 'ISO8859-1', 'latin1', 'utf-8']:
	    try:
	        # The context manager closes the file even when decoding fails.
	        with open('text-with-utf-8.txt', encoding=enc) as infile:
	            s = infile.read()
	    except UnicodeDecodeError:
	        # This encoding cannot decode the file; try the next candidate.
	        continue
	    # Report which encoding succeeded and stop probing.
	    print("Read with encoding =", enc)
	    break

	print(s)
	# gives you a UnicodeEncodeError

	print(s.encode('ascii', 'replace'))
	# b'R??dvad ??belr??d'

	# Encoding in source code

	# Python 2 script with UTF-8 encoding

	#!/usr/bin/python2.6
	# -*- coding: utf-8 -*-
	print("Rådvad Knivfabrik")

	# Python3
	Æ = 3
	A = 1
	A + Æ

	"""
	http://docs.python.org/howto/unicode.html
	Unicode HOWTO in Python documentation

	http://diveintopython.org/xml_processing/unicode.html

	http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html

	Kumar McMillan's talk
	http://farmdev.com/talks/unicode/
	Unicode In Python, Completely Demystified, from PyCon 2008.
	"""
No results found