wolever · August 29, 2015 14:22
diff --git a/str_format_will_crash_your_app.py b/str_format_will_crash_your_app.py
 # Python 2's %-formatting will "upgrade" the format string to unicode if an
 # argument is unicode, whereas str.format(…) will downgrade unicode arguments
 # to bytes. This leads to unnecessarily fragile code, as very small programmer
 # mistakes can cause show-stopping Unicode-related exceptions.

 # Consider the simplest "hello world" of string formatting with:
 >>> name = u"Aléx ✨"

 # Using %-formatting:
 >>> "Hello, %s!" %(name, )
 u'Hello, Aléx ✨!'

 # Using str.format:
 >>> "Hello, {}!".format(name)
 ...
 UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 2: ordinal not in range(128)

 # Of course, this can be "solved" by remembering to prefix every string with 'u':
 >>> u"Hello, {}!".format(name)
 u'Hello, Aléx ✨!'

 # But not only is this ugly, it's also error prone, and there's nothing static
 # analyzers (like the fantastically useful flake8) can do to help, because
 # unicode.format(...) will try to convert all of its arguments to unicode:
 >>> u"Some bytes: {}".format("\xfa\xfb\xfc")
 ...
 UnicodeDecodeError: 'ascii' codec can't decode byte 0xfa in position 0: ordinal not in range(128)

 # Contrast, again, with the more sensible behaviour of %-formatting:
 >>> "Some bytes: %s" %("\xfa\xfb\xfc", )
 'Some bytes: \xfa\xfb\xfc'

 # %-formatting will only cause an encoding/decoding error if non-ASCII bytes
 # and unicode characters are mixed (which is reasonable, since there is no
 # obvious "right answer" in this situation):
 >>> "Hello, %s! Here are some bytes: %s" %(name, "\xfa\xfb\xfc")
 ...
 UnicodeDecodeError: 'ascii' codec can't decode byte 0xfa in position 0: ordinal not in range(128)


 # Now, some will argue that you are an irresponsible programmer if you don't
 # know the specific type of every variable, and it's Just Your Job to prefix
 # every format string with 'u'. But there are two reasons you shouldn't listen
 # to those people:
 #
 # 1) The str/unicode situation in Python 2 is absurd. In my experience it's
 #    virtually impossible to know with 100% certainty whether a "string-like"
 #    thing is a ``str`` or a ``unicode`` (especially when 3rd party libraries
 #    are being used), and it usually makes zero practical difference (see
 #    footnote).
 #
 # 2) Imagine that we were discussing numbers instead of strings. It would be
 #    frustrating and silly if ``x + 1.5`` failed if ``x`` was an ``int``.
 #    Why should strings be any different?
 #
 # Footnote: there is usually zero practical difference between str and unicode
 # for string-like data because the distinction only becomes relevant when it
 # comes time to turn the string into bytes (e.g., save it to a file or send it
 # to a web browser), and that can be addressed with a ``to_str``:

 def to_str(obj, encoding='utf-8'):
     """ Converts ``obj`` to a ``str`` in the most sensible way possible.

         >>> to_str("\xff")
         '\xff'
         >>> to_str(u"\u1234")
         '\xe1\x88\xb4'
         >>> to_str([1234])
         '[1234]'

         ``str`` instances are returned unchanged. ``unicode`` instances, and
         objects exposing ``__unicode__``, are converted to ``unicode`` and
         then encoded with ``encoding``. Anything else goes through
         ``str(obj)``.

         From unstdlib.py
         https://github.com/shazow/unstdlib.py/blob/master/unstdlib/standard/string_.py#L196
     """
     if isinstance(obj, str):
         # Already a byte string: hand it back untouched.
         return obj
     if isinstance(obj, unicode) or hasattr(obj, '__unicode__'):
         # Call the ``unicode`` builtin directly: the ``text_type`` alias that
         # unstdlib uses here is not defined in this file, so referencing it
         # would raise NameError on exactly the inputs this branch handles.
         return unicode(obj).encode(encoding)
     return str(obj)
	# Python 2's %-formatting will "upgrade" the format string to unicode if an
	# argument is unicode, whereas str.format(…) will downgrade unicode arguments
	# to bytes. This leads to unnecessarily fragile code, as very small programmer
	# mistakes can cause show-stopping Unicode-related exceptions.

	# Consider the simplest "hello world" of string formatting with:
	>>> name = u"Aléx ✨"

	# Using %-formatting:
	>>> "Hello, %s!" %(name, )
	u'Hello, Aléx ✨!'

	# Using str.format:
	>>> "Hello, {}!".format(name)
	...
	UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 2: ordinal not in range(128)

	# Of course, this can be "solved" by remembering to prefix every string with 'u':
	>>> u"Hello, {}!".format(name)
	u'Hello, Aléx ✨!'

	# But not only is this ugly, it's also error prone, and there's nothing static
	# analyzers (like the fantastically useful flake8) can do to help, because
	# unicode.format(...) will try to convert all of its arguments to unicode:
	>>> u"Some bytes: {}".format("\xfa\xfb\xfc")
	...
	UnicodeDecodeError: 'ascii' codec can't decode byte 0xfa in position 0: ordinal not in range(128)

	# Contrast, again, with the more sensible behaviour of %-formatting:
	>>> "Some bytes: %s" %("\xfa\xfb\xfc", )
	'Some bytes: \xfa\xfb\xfc'

	# %-formatting will only cause an encoding/decoding error if non-ASCII bytes
	# and unicode characters are mixed (which is reasonable, since there is no
	# obvious "right answer" in this situation):
	>>> "Hello, %s! Here are some bytes: %s" %(name, "\xfa\xfb\xfc")
	...
	UnicodeDecodeError: 'ascii' codec can't decode byte 0xfa in position 0: ordinal not in range(128)


	# Now, some will argue that you are an irresponsible programmer if you don't
	# know the specific type of every variable, and it's Just Your Job to prefix
	# every format string with 'u'. But there are two reasons you shouldn't listen
	# to those people:
	#
	# 1) The str/unicode situation in Python 2 is absurd. In my experience it's
	# virtually impossible to know with 100% certainty whether a "string-like"
	# thing is a ``str`` or a ``unicode`` (especially when 3rd party libraries
	# are being used), and it usually makes zero practical difference (see
	# footnote).
	#
	# 2) Imagine that we were discussing numbers instead of strings. It would be
	# frustrating and silly if ``x + 1.5`` failed if ``x`` was an ``int``.
	# Why should strings be any different?
	#
	# Footnote: there is usually zero practical difference between str and unicode
	# for string-like data because the distinction only becomes relevant when it
	# comes time to turn the string into bytes (e.g., save it to a file or send it
	# to a web browser), and that can be addressed with a ``to_str``:

	def to_str(obj, encoding='utf-8'):
		""" Converts ``obj`` to a ``str`` in the most sensible way possible.

			>>> to_str("\xff")
			'\xff'
			>>> to_str(u"\u1234")
			'\xe1\x88\xb4'
			>>> to_str([1234])
			'[1234]'

			``str`` instances are returned unchanged. ``unicode`` instances, and
			objects exposing ``__unicode__``, are converted to ``unicode`` and
			then encoded with ``encoding``. Anything else goes through
			``str(obj)``.

			From unstdlib.py
			https://github.com/shazow/unstdlib.py/blob/master/unstdlib/standard/string_.py#L196
		"""
		if isinstance(obj, str):
			# Already a byte string: hand it back untouched.
			return obj
		if isinstance(obj, unicode) or hasattr(obj, '__unicode__'):
			# Call the ``unicode`` builtin directly: the ``text_type`` alias that
			# unstdlib uses here is not defined in this file, so referencing it
			# would raise NameError on exactly the inputs this branch handles.
			return unicode(obj).encode(encoding)
		return str(obj)