Created
November 25, 2014 07:49
-
-
Save d2207197/7ce126ebf5660bfbb7b3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"celltoolbar": "Slideshow", | |
"name": "", | |
"signature": "sha256:c12e1eb8f4876bb92f3ecf8dff9af0d4e551976eee964d5f3a9af4fffa7f588a" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print '\u87d2'.decode()" | |
], | |
"language": "python", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "slide" | |
} | |
}, | |
"outputs": [ | |
{ | |
"ename": "UnicodeDecodeError", | |
"evalue": "'ascii' codec can't decode byte 0xe8 in position 0: ordinal not in range(128)", | |
"output_type": "pyerr", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-7-3aba86b61646>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mprint\u001b[0m \u001b[0;34m'\u87d2'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[0;31mUnicodeDecodeError\u001b[0m: 'ascii' codec can't decode byte 0xe8 in position 0: ordinal not in range(128)" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "slide" | |
} | |
}, | |
"source": [ | |
"# Str/Unicode in Python 2" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "slide" | |
} | |
}, | |
"source": [ | |
"## The truth about Python 2 str" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"type('\u87d2')" | |
], | |
"language": "python", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "fragment" | |
} | |
}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 14, | |
"text": [ | |
"str" | |
] | |
} | |
], | |
"prompt_number": 14 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"len('\u87d2')\n" | |
], | |
"language": "python", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "fragment" | |
} | |
}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 13, | |
"text": [ | |
"3" | |
] | |
} | |
], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"list('\u87d2')" | |
], | |
"language": "python", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "fragment" | |
} | |
}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 4, | |
"text": [ | |
"['\\xe8', '\\x9f', '\\x92']" | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "subslide" | |
} | |
}, | |
"source": [ | |
"- A Python 2 str is a ***byte array***, encoded from characters.\n", | |
"- Its **methods/functions** manipulate ***bytes***, not ***characters***." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"'\u87d2'[1]" | |
], | |
"language": "python", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "fragment" | |
} | |
}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 31, | |
"text": [ | |
"'\\x9f'" | |
] | |
} | |
], | |
"prompt_number": 31 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"'\u87d2'.split('\\x9f')" | |
], | |
"language": "python", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "fragment" | |
} | |
}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 32, | |
"text": [ | |
"['\\xe8', '\\x92']" | |
] | |
} | |
], | |
"prompt_number": 32 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"sorted('\u87d2\u86c7')" | |
], | |
"language": "python", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "fragment" | |
} | |
}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 35, | |
"text": [ | |
"['\\x87', '\\x92', '\\x9b', '\\x9f', '\\xe8', '\\xe8']" | |
] | |
} | |
], | |
"prompt_number": 35 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print ''.join(sorted('\u87d2\u86c7'))" | |
], | |
"language": "python", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "fragment" | |
} | |
}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd\n" | |
] | |
} | |
], | |
"prompt_number": 109 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "slide" | |
} | |
}, | |
"source": [ | |
"# unicode type - a sequence of real characters (not bytes)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print type(u'\u87d2')\n", | |
"print len(u'\u87d2')\n", | |
"print list(u'\u87d2')\n", | |
"print u'\u87d2'[0]\n", | |
"print ''.join(sorted(u'\u87d2\u86c7'))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"<type 'unicode'>\n", | |
"1\n", | |
"[u'\\u87d2']\n", | |
"\u87d2\n", | |
"\u86c7\u87d2\n" | |
] | |
} | |
], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "subslide" | |
} | |
}, | |
"source": [ | |
"# str <-> unicode\n", | |
"- `str.decode()` -> **unicode**\n", | |
"- `unicode.encode()` -> **str**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print repr('\u87d2'.decode('utf-8'))\n", | |
"print repr(u'\u87d2'.encode('utf-8'))\n", | |
"print u'\u87d2' == '\u87d2'.decode('utf8')\n", | |
"print u'\u87d2'.encode('utf8') == '\u87d2'" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"u'\\u87d2'\n", | |
"'\\xe8\\x9f\\x92'\n", | |
"True\n", | |
"True\n" | |
] | |
} | |
], | |
"prompt_number": 126 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "subslide" | |
} | |
}, | |
"source": [ | |
"## What if we encode a str or decode a unicode?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"'\u6211'.encode('big5') " | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"ename": "UnicodeDecodeError", | |
"evalue": "'ascii' codec can't decode byte 0xe6 in position 0: ordinal not in range(128)", | |
"output_type": "pyerr", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-132-b5881213b084>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;34m'\u6211'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'big5'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[0;31mUnicodeDecodeError\u001b[0m: 'ascii' codec can't decode byte 0xe6 in position 0: ordinal not in range(128)" | |
] | |
} | |
], | |
"prompt_number": 132 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "fragment" | |
} | |
}, | |
"source": [ | |
"**which means** (`str.encode()` first implicitly decodes the str with the default codec)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"'\u6211'.decode().encode('big5') " | |
], | |
"language": "python", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "fragment" | |
} | |
}, | |
"outputs": [ | |
{ | |
"ename": "UnicodeDecodeError", | |
"evalue": "'ascii' codec can't decode byte 0xe6 in position 0: ordinal not in range(128)", | |
"output_type": "pyerr", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-133-37174db75bb4>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;34m'\u6211'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'big5'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[0;31mUnicodeDecodeError\u001b[0m: 'ascii' codec can't decode byte 0xe6 in position 0: ordinal not in range(128)" | |
] | |
} | |
], | |
"prompt_number": 133 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"u'\u6211'.decode('big5')" | |
], | |
"language": "python", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "subslide" | |
} | |
}, | |
"outputs": [ | |
{ | |
"ename": "UnicodeEncodeError", | |
"evalue": "'ascii' codec can't encode character u'\\u6211' in position 0: ordinal not in range(128)", | |
"output_type": "pyerr", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mUnicodeEncodeError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-134-798a264c3682>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;34mu'\u6211'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'big5'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[0;31mUnicodeEncodeError\u001b[0m: 'ascii' codec can't encode character u'\\u6211' in position 0: ordinal not in range(128)" | |
] | |
} | |
], | |
"prompt_number": 134 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "fragment" | |
} | |
}, | |
"source": [ | |
"**which means** (`unicode.decode()` first implicitly encodes the unicode with the default codec)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"u'\u6211'.encode().decode('big5') # which means" | |
], | |
"language": "python", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "fragment" | |
} | |
}, | |
"outputs": [ | |
{ | |
"ename": "UnicodeEncodeError", | |
"evalue": "'ascii' codec can't encode character u'\\u6211' in position 0: ordinal not in range(128)", | |
"output_type": "pyerr", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mUnicodeEncodeError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-135-46fcdf96d127>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;34mu'\u6211'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'big5'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[0;31mUnicodeEncodeError\u001b[0m: 'ascii' codec can't encode character u'\\u6211' in position 0: ordinal not in range(128)" | |
] | |
} | |
], | |
"prompt_number": 135 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "subslide" | |
} | |
}, | |
"source": [ | |
"# Default Encoding \n", | |
"- Python 2: ascii\n", | |
"- Python 3: utf-8" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import sys\n", | |
"sys.getdefaultencoding()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 137, | |
"text": [ | |
"'ascii'" | |
] | |
} | |
], | |
"prompt_number": 137 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"- `'\u87d2'.decode() == '\u87d2'.decode('ascii')`\n", | |
"- `u'\u87d2'.encode() == u'\u87d2'.encode('ascii')`" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "subslide" | |
} | |
}, | |
"source": [ | |
"## ***!! please don't encode a str or decode a unicode !!***\n", | |
"\n", | |
"- ~~`str.encode()`~~\n", | |
"- ~~`unicode.decode()`~~" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "slide" | |
} | |
}, | |
"source": [ | |
"# Different Encodings\n", | |
"- [Python standard encodings list](https://docs.python.org/2/library/codecs.html#standard-encodings)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print '\u00e9 source\\t', list('\u00e9')\n", | |
"print '\u00e9 utf8\\t\\t', list('\u00e9'.decode('utf-8').encode('utf8'))\n", | |
"print '\u00e9 latin-1\\t', list('\u00e9'.decode('utf-8').encode('iso-8859-1'))\n", | |
"print '\u00e9 cp1140\\t', list('\u00e9'.decode('utf-8').encode('cp1140'))\n", | |
"print \n", | |
"print '\u87d2 source\\t', list('\u87d2')\n", | |
"print '\u87d2 utf-8\\t', list('\u87d2'.decode('utf-8').encode('utf-8'))\n", | |
"print '\u87d2 big5\\t\\t', list('\u87d2'.decode('utf-8').encode('big5'))\n", | |
"print '\u87d2 gbk\\t\\t', list('\u87d2'.decode('utf-8').encode('gbk'))\n", | |
"print '\u87d2 utf-16\\t', list('\u87d2'.decode('utf-8').encode('utf-16'))\n", | |
"print '\u87d2 shift_jis\\t', list('\u87d2'.decode('utf-8').encode('shift_jis'))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\u00e9 source\t['\\xc3', '\\xa9']\n", | |
"\u00e9 utf8\t\t['\\xc3', '\\xa9']\n", | |
"\u00e9 latin-1\t['\\xe9']\n", | |
"\u00e9 cp1140\t['Q']\n", | |
"\n", | |
"\u87d2 source\t['\\xe8', '\\x9f', '\\x92']\n", | |
"\u87d2 utf-8\t['\\xe8', '\\x9f', '\\x92']\n", | |
"\u87d2 big5\t\t['\\xc1', '\\xaf']\n", | |
"\u87d2 gbk\t\t['\\xf2', '\\xfe']\n", | |
"\u87d2 utf-16\t['\\xff', '\\xfe', '\\xd2', '\\x87']\n", | |
"\u87d2 shift_jis\t['\\xe5', '\\xbb']\n" | |
] | |
} | |
], | |
"prompt_number": 106 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "subslide" | |
} | |
}, | |
"source": [ | |
"# Source Code Encoding" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print list('\u87d2') # Why '\u87d2' is in UTF-8?" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"['\\xe8', '\\x9f', '\\x92']\n" | |
] | |
} | |
], | |
"prompt_number": 89 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"- [PEP 263 -- Defining Python Source Code Encodings](https://www.python.org/dev/peps/pep-0263/)\n", | |
"- default: ASCII\n", | |
"\n", | |
"- magic comment in first or second line in the file\n", | |
" # coding=<encoding name>\n", | |
"- or\n", | |
" #!/usr/bin/env python\n", | |
" # -*- coding: <encoding name> -*-\n", | |
"- or\n", | |
" #!/usr/bin/env python\n", | |
" # vim: set fileencoding=<encoding name> :" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "slide" | |
} | |
}, | |
"source": [ | |
"# Results of functions/operators/methods when mixing unicode and str" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print type('P' + u'Y')\n", | |
"print type('\u87d2' + u'\u86c7')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"<type 'unicode'>\n" | |
] | |
}, | |
{ | |
"ename": "UnicodeDecodeError", | |
"evalue": "'ascii' codec can't decode byte 0xe8 in position 0: ordinal not in range(128)", | |
"output_type": "pyerr", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-141-2411cd12ab6c>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'P'\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34mu'Y'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mprint\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'\u87d2'\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34mu'\u86c7'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[0;31mUnicodeDecodeError\u001b[0m: 'ascii' codec can't decode byte 0xe8 in position 0: ordinal not in range(128)" | |
] | |
} | |
], | |
"prompt_number": 141 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print type(u'P' + 'Y')\n", | |
"print type(u'\u87d2' + '\u86c7')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"<type 'unicode'>\n" | |
] | |
}, | |
{ | |
"ename": "UnicodeDecodeError", | |
"evalue": "'ascii' codec can't decode byte 0xe8 in position 0: ordinal not in range(128)", | |
"output_type": "pyerr", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-142-b37886e8a94f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mu'P'\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'Y'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mprint\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mu'\u87d2'\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'\u86c7'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[0;31mUnicodeDecodeError\u001b[0m: 'ascii' codec can't decode byte 0xe8 in position 0: ordinal not in range(128)" | |
] | |
} | |
], | |
"prompt_number": 142 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print type('{}'.format(u'a'))\n", | |
"print type('{}'.format('\u87d2'))\n", | |
"print type('{}'.format(u'\u87d2'))" | |
], | |
"language": "python", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "subslide" | |
} | |
}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"<type 'str'>\n", | |
"<type 'str'>\n" | |
] | |
}, | |
{ | |
"ename": "UnicodeEncodeError", | |
"evalue": "'ascii' codec can't encode character u'\\u87d2' in position 0: ordinal not in range(128)", | |
"output_type": "pyerr", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mUnicodeEncodeError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-153-a662ff099551>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'{}'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mu'a'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'{}'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'\u87d2'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mprint\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'{}'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mu'\u87d2'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[0;31mUnicodeEncodeError\u001b[0m: 'ascii' codec can't encode character u'\\u87d2' in position 0: ordinal not in range(128)" | |
] | |
} | |
], | |
"prompt_number": 153 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print type(u'{}'.format('a'))\n", | |
"print type(u'{}'.format(u'\u87d2'))\n", | |
"print type(u'{}'.format('\u87d2'))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"<type 'unicode'>\n", | |
"<type 'unicode'>\n" | |
] | |
}, | |
{ | |
"ename": "UnicodeDecodeError", | |
"evalue": "'ascii' codec can't decode byte 0xe8 in position 0: ordinal not in range(128)", | |
"output_type": "pyerr", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-151-3a1f8b464437>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mu'{}'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'a'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mu'{}'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mu'\u87d2'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mprint\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mu'{}'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'\u87d2'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[0;31mUnicodeDecodeError\u001b[0m: 'ascii' codec can't decode byte 0xe8 in position 0: ordinal not in range(128)" | |
] | |
} | |
], | |
"prompt_number": 151 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print type(' '.join([u'P', 'Y']))\n", | |
"print type(' '.join([u'\u87d2', u'\u86c7']))\n", | |
"print type(u' '.join(['\u87d2', '\u86c7']))" | |
], | |
"language": "python", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "subslide" | |
} | |
}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"<type 'unicode'>\n", | |
"<type 'unicode'>\n" | |
] | |
}, | |
{ | |
"ename": "UnicodeDecodeError", | |
"evalue": "'ascii' codec can't decode byte 0xe8 in position 0: ordinal not in range(128)", | |
"output_type": "pyerr", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-159-0b9b97120070>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' '\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34mu'P'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'Y'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' '\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34mu'\u87d2'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34mu'\u86c7'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mprint\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mu' '\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'\u87d2'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'\u86c7'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[0;31mUnicodeDecodeError\u001b[0m: 'ascii' codec can't decode byte 0xe8 in position 0: ordinal not in range(128)" | |
] | |
} | |
], | |
"prompt_number": 159 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "slide" | |
} | |
}, | |
"source": [ | |
"# 3 Rules\n", | |
"1. Decode all inputs (to unicode)\n", | |
"1. Unicode everywhere\n", | |
"1. Encode outputs (to str)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "subslide" | |
} | |
}, | |
"source": [ | |
"## Decode Inputs" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"for line in fileinput.input():\n", | |
" line = line.decode('utf-8')" | |
], | |
"language": "python", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "-" | |
} | |
}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import codecs\n", | |
"with codecs.open('/path/to/file', 'r', encoding = 'utf-8') as the_file:\n", | |
" for line in the_file:\n", | |
" # ..." | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "subslide" | |
} | |
}, | |
"source": [ | |
"## Unicode everywhere\n", | |
"- always use u'...' for literals containing non-ASCII characters\n", | |
"- always use u'...' for format strings:\n", | |
" 1. `u'{}'.format(var)` \n", | |
" 2. `u'%s' % var` *(before python 2.6)*\n", | |
"- use `unicode()`, `unicode.split()`, ... instead of `str()`, `str.split()`, ..." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "subslide" | |
} | |
}, | |
"source": [ | |
"## Encode Outputs" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"with open('/path/to/file','w') as the_file:\n", | |
" the_file.write(u'\u87d2'.encode('utf-8'))\n", | |
" print >> the_file, u'\u86c7'.encode('utf-8')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"with codecs.open('/path/to/file','w', encoding = 'utf-8') as the_file:\n", | |
" the_file.write(u'\u87d2')\n", | |
" print >> the_file, u'\u86c7'" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print u'\u87d2'" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "subslide" | |
} | |
}, | |
"source": [ | |
"## print automatically encodes unicode\n", | |
"\n", | |
"- **Let the locale decide; don't encode it yourself.**\n", | |
"\n", | |
"```\n", | |
"$ LC_ALL=zh_TW.UTF-8 python << EOF\n", | |
"> # coding=utf-8\n", | |
"> print u\"unicode\\t\\t\u87d2\"\n", | |
"> print \"utf-8 str\\t\u87d2\"\n", | |
"> print u\"big5 str\\t\u87d2\".encode(\"big5\")\n", | |
"> EOF\n", | |
"unicode\t\t\u87d2\n", | |
"utf-8 str\t\u87d2\n", | |
"big5 str\t\ufffd\n", | |
"\n", | |
"$ LC_ALL=zh_TW.Big5 python << EOF\n", | |
"> # coding=utf-8\n", | |
"> print u\"unicode\\t\\t\u87d2\"\n", | |
"> print \"utf-8 str\\t\u87d2\"\n", | |
"> print u\"big5 str\\t\u87d2\".encode(\"big5\")\n", | |
"> EOF\n", | |
"unicode\t\t\ufffd\n", | |
"utf-8 str\t\u87d2\n", | |
"big5 str\t\ufffd\n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Python 3\n", | |
"- python 2 str (`'something'`) == python 3 bytes (`b'something'`)\n", | |
"- python 2 unicode (`u'something'`) == python 3 str (`'something'`)\n", | |
"\n", | |
" In [8]: '\u87d2'.encode('utf-8')\n", | |
" Out[8]: b'\\xe8\\x9f\\x92'\n", | |
"\n", | |
" In [9]: type('\u87d2'.encode('utf-8'))\n", | |
" Out[9]: bytes\n", | |
"\n", | |
"- all builtin modules support unicode\n", | |
"- open() takes encoding argument, like codecs.open() in python 2\n", | |
"- default encoding is utf-8 not ascii\n", | |
"- There is no `str.decode()` and no `bytes.encode()`\n" | |
] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment