Created
June 13, 2015 18:15
-
-
Save skrul/0005d235d045ce41adb6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import re | |
print 'sys.maxunicode', sys.maxunicode | |
print 'internal string encoding is', sys.maxunicode > 65536 and 'UCS4' or 'UCS2' | |
# These bytes are a utf-8 encoded cake, see https://codepoints.net/U+1F370 | |
cake = '\xF0\x9F\x8D\xB0'.decode('utf-8') | |
print 'cake is', cake | |
# The byte-level representation is different for ucs2 vs. ucs4. | |
print 'cake string in bytes is', len(cake) | |
print 'hex value of first byte:', hex(ord(cake[0])) | |
print 'hex value of second byte:', len(cake) > 1 and hex(ord(cake[1])) or 'n/a' | |
# Here are two valid ways to put a cake literal into Python | |
# source code. | |
# You can use \U to reference a 32 byte (ucs4) code point. | |
cake_ucs4 = u'\U0001F370' | |
# Another way to do this is with a surrogate pair. Since this | |
# character is on a supplementary plane (whose codepoint can't | |
# fit in 16 bytes), there are some rules to split it into | |
# two 16 byte codepoints, see: | |
# https://en.wikipedia.org/wiki/UTF-16#U.2B10000_to_U.2B10FFFF | |
cake_ucs2 = u'\ud83c\udf70' | |
# While both cake_ucs4 and cake_ucs2 render a cake, their internal | |
# representations are different. | |
print 'cake_ucs2', cake_ucs2 | |
print 'len(cake_ucs2)', len(cake_ucs2) | |
print 'cake_ucs4', cake_ucs4 | |
print 'len(cake_ucs4)', len(cake_ucs4) | |
# Interestingly, on a ucs4 system, cake_ucs2 and cake_ucs4 keep | |
# the representation originally defined. However, on a ucs2 system, | |
# the 16 byte literal will be converted into the surrogate pair. | |
# So on a ucs4 system, cake_ucs2 and cake_ucs4 are not the same | |
# even though they refer to the same unicode code point. | |
print 'cake_ucs2 == cake_ucs4?', cake_ucs2 == cake_ucs4 | |
if len(cake_ucs4) > 1: | |
print '*** four byte literal converted to a surrogate pair' | |
# It is my understanding is that the regular expression engine | |
# is not really aware of unicode and will match things based | |
# on bytes. This means that the single code point cake will | |
# not match the surrogate pair cake! So when using unicode | |
# literals in a regular expression, the representation can | |
# matter. | |
print 'native regexp match?', re.compile(cake).match(cake) and 'yes' or 'no' | |
print 'ucs2 regexp match?', re.compile(cake_ucs2).match(cake) and 'yes' or 'no' | |
print 'ucs4 regexp match?', re.compile(cake_ucs4).match(cake) and 'yes' or 'no' | |
# This gets even more complex with regular expression character | |
# ranges. This does not compile on ucs2 systems: | |
try: | |
re.compile(u'[\U00010000-\U0001000f]') | |
print 'I can compile a 16 bit codepoint character range' | |
except: | |
print 'I can\'t compile a 16 bit codepoint character range :( :(' | |
# Why? Because a 16 bit codepoint unicode literal is implicitly | |
# converted to a surrogate pair before the string is passed to | |
# the regular expression engine. So on ucs2, the above character | |
# range looks like: | |
try: | |
re.compile(u'[\ud800\udc00-\ud800\udc0f]') | |
except: | |
print 'Nobody wants this' | |
# And this is just an invalid character range because the | |
# regular expression engine is dumb about unicode and thinks | |
# you want a range from 0xdc00 to 0xd800. This is invalid | |
# because it is a descending range, just like '[9-0]' is | |
# invalid. | |
# So despite the implicit conversion of 32-bit (\U) code point
# literals to surrogate pairs on ucs2 systems, using these literals
# in a regular expression character range won't do what you expect
# and may even produce an invalid regular expression.

# So when using a regular expression with a unicode character
# range, you should not rely on the implicit conversion; instead,
# check sys.maxunicode and create two separate regular
# expressions. Note that the original nomoji code relied on
# the implicit conversion causing an exception to switch to the
# surrogate pair syntax -- I would suggest changing this to
# be more explicit.

# For the test strings embedded in code, I believe you can
# always use the 32-bit (\U) code point literals as they will
# be implicitly converted to a surrogate pair on ucs2 systems
# and therefore will match your regular expression that will
# be using the same. You should never use a surrogate pair
# since, on ucs4 systems, it will remain a surrogate pair
# and not match a regular expression built with 32-bit code point
# literals.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment