cscorley · August 29, 2015 14:06
diff --git a/archlinux b/archlinux
 cscorley@geogaddi: ~
 % python2
 Python 2.7.8 (default, Jul  1 2014, 17:30:21)
 [GCC 4.9.0 20140604 (prerelease)] on linux2
 Type "help", "copyright", "credits" or "license" for more information.
 >>> s = u"test💩word"
 >>> s
 u'test\U0001f4a9word'
 >>> type(s)
 <type 'unicode'>
 >>> len(s)
 9
 >>> for each in s:
 ...    print(each)
 ...
 t
 e
 s
 t
 💩
 w
 o
 r
 d
 >>>

 cscorley@geogaddi: ~
 % locale
 LANG=en_US.utf8
 LC_CTYPE=en_US.utf8
 LC_NUMERIC="en_US.utf8"
 LC_TIME="en_US.utf8"
 LC_COLLATE="en_US.utf8"
 LC_MONETARY="en_US.utf8"
 LC_MESSAGES="en_US.utf8"
 LC_PAPER="en_US.utf8"
 LC_NAME="en_US.utf8"
 LC_ADDRESS="en_US.utf8"
 LC_TELEPHONE="en_US.utf8"
 LC_MEASUREMENT="en_US.utf8"
 LC_IDENTIFICATION="en_US.utf8"
 LC_ALL=
diff --git a/osx b/osx
 cscorley@bocmaxima: ~
 % python2
 Python 2.7.8 (default, Jul  2 2014, 10:14:46)
 [GCC 4.2.1 Compatible Apple LLVM 5.1 (clang-503.0.40)] on darwin
 Type "help", "copyright", "credits" or "license" for more information.
 >>> s = u"test💩word"
 >>> s
 u'test\U0001f4a9word'
 >>> type(s)
 <type 'unicode'>
 >>> len(s)
 10
 >>> for each in s:
 ...    print(each)
 ...
 t
 e
 s
 t
 �
 �
 w
 o
 r
 d
 >>>

 cscorley@bocmaxima: ~
 % locale
 LANG="en_US.UTF-8"
 LC_COLLATE="en_US.UTF-8"
 LC_CTYPE="en_US.UTF-8"
 LC_MESSAGES="en_US.UTF-8"
 LC_MONETARY="en_US.UTF-8"
 LC_NUMERIC="en_US.UTF-8"
 LC_TIME="en_US.UTF-8"
 LC_ALL=
diff --git a/osx-withnormalize b/osx-withnormalize
 cscorley@bocmaxima: ~
 % python2
 Python 2.7.8 (default, Jul  2 2014, 10:14:46)
 [GCC 4.2.1 Compatible Apple LLVM 5.1 (clang-503.0.40)] on darwin
 Type "help", "copyright", "credits" or "license" for more information.
 >>> s = u"test💩word"
 >>> import unicodedata
 >>> unicodedata.normalize('NFC', s)
 u'test\U0001f4a9word'
 >>> s
 u'test\U0001f4a9word'
 >>> len(unicodedata.normalize('NFC', s))
 10
 >>> for char in unicodedata.normalize('NFC', s):
 ...    print(char)
 ...
 t
 e
 s
 t
 �
 �
 w
 o
 r
 d
 >>> len(unicodedata.normalize('NFKC', s))
 10
 >>> len(unicodedata.normalize('NFD', s))
 10
 >>> len(unicodedata.normalize('NFKD', s))
 10
 >>>
	cscorley@geogaddi: ~
	% python2
	Python 2.7.8 (default, Jul 1 2014, 17:30:21)
	[GCC 4.9.0 20140604 (prerelease)] on linux2
	Type "help", "copyright", "credits" or "license" for more information.
	>>> s = u"test💩word"
	>>> s
	u'test\U0001f4a9word'
	>>> type(s)
	<type 'unicode'>
	>>> len(s)
	9
	>>> for each in s:
	...    print(each)
	...
	t
	e
	s
	t
	💩
	w
	o
	r
	d
	>>>

	cscorley@geogaddi: ~
	% locale
	LANG=en_US.utf8
	LC_CTYPE=en_US.utf8
	LC_NUMERIC="en_US.utf8"
	LC_TIME="en_US.utf8"
	LC_COLLATE="en_US.utf8"
	LC_MONETARY="en_US.utf8"
	LC_MESSAGES="en_US.utf8"
	LC_PAPER="en_US.utf8"
	LC_NAME="en_US.utf8"
	LC_ADDRESS="en_US.utf8"
	LC_TELEPHONE="en_US.utf8"
	LC_MEASUREMENT="en_US.utf8"
	LC_IDENTIFICATION="en_US.utf8"
	LC_ALL=
	cscorley@bocmaxima: ~
	% python2
	Python 2.7.8 (default, Jul 2 2014, 10:14:46)
	[GCC 4.2.1 Compatible Apple LLVM 5.1 (clang-503.0.40)] on darwin
	Type "help", "copyright", "credits" or "license" for more information.
	>>> s = u"test💩word"
	>>> s
	u'test\U0001f4a9word'
	>>> type(s)
	<type 'unicode'>
	>>> len(s)
	10
	>>> for each in s:
	...    print(each)
	...
	t
	e
	s
	t
	�
	�
	w
	o
	r
	d
	>>>

	cscorley@bocmaxima: ~
	% locale
	LANG="en_US.UTF-8"
	LC_COLLATE="en_US.UTF-8"
	LC_CTYPE="en_US.UTF-8"
	LC_MESSAGES="en_US.UTF-8"
	LC_MONETARY="en_US.UTF-8"
	LC_NUMERIC="en_US.UTF-8"
	LC_TIME="en_US.UTF-8"
	LC_ALL=