prashanthpai · September 22, 2015 06:59
diff --git a/swift_bad_func_test.py b/swift_bad_func_test.py
 import json
 import uuid
 import random

 # Basics
 # .encode('utf-8') converts unicode into byte sequence
 # .decode('utf-8') converts byte sequence into unicode
 #
 # A unicode code point is encoded into a sequence of bytes.
 # In UTF-8 this encoded sequence is one to four bytes long (the original
 # specification allowed up to six bytes; RFC 3629 restricts it to four).


 def create_ascii_name():
    """Return a random ASCII name: the hex form of a fresh version-4 UUID."""
    fresh_uuid = uuid.uuid4()
    return fresh_uuid.hex


 def create_utf8_name(length=15):
    """Return ``length`` random non-ASCII characters encoded as UTF-8 bytes.

    Characters are drawn, with repetition, from a fixed pool of code
    points. Every pool member encodes to three bytes except U+03A9,
    which encodes to two, so the result is longer in bytes than in
    characters.

    :param length: number of characters to pick (default 15).
    :returns: a valid UTF-8 byte string (``str`` on Python 2).
    """
    utf8_chars = u'\uF10F\uD20D\uB30B\u9409\u8508\u5605\u3703\u1801'\
                 u'\u0900\uF110\uD20E\uB30C\u940A\u8509\u5606\u3704'\
                 u'\u1802\u0901\uF111\uD20F\uB30D\u940B\u850A\u5607'\
                 u'\u3705\u1803\u0902\uF112\uD210\uB30E\u940C\u850B'\
                 u'\u5608\u3706\u1804\u0903\u03A9\u2603'
    # range() instead of xrange(): same result for a handful of items and
    # the name also exists on Python 3.
    picked = [random.choice(utf8_chars) for _ in range(length)]
    # Join with a unicode separator so no implicit str->unicode promotion
    # is involved before encoding.
    return u''.join(picked).encode('utf-8')


 def test():
    """Reproduce json.dumps() failing on byte-truncated UTF-8 values.

    Builds a dict of 80 entries. Each key is a random hex name cut to 25
    characters; each value is a random UTF-8 byte string cut to 25
    *bytes*. The last untruncated and truncated values are printed, then
    the dict is passed to json.dumps().

    :raises UnicodeDecodeError: from json.dumps() on Python 2, once at
        least one stored value is no longer valid UTF-8.
    NOTE(review): the script targets Python 2; on Python 3 json.dumps()
    rejects bytes values with TypeError instead.
    """
    entry_count = 80
    max_len = 25
    metadata = {}
    while len(metadata) < entry_count:
        key = create_ascii_name()
        # This returns a valid utf8 character sequence
        val = create_utf8_name()
        # Remember the untruncated value on every pass so the print below
        # can never refer to an unassigned name.
        orig_val = val

        if len(key) > max_len:
            key = key[:max_len]
            # The value is sliced by bytes, not by characters, so the cut
            # usually lands in the middle of a multi-byte character and
            # turns the valid 'utf8' byte sequence into an invalid one.
            # https://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
            # Examples (taken from different runs):
            # Valid sequence before slicing: '\xe9\x90\x8b\xef\x84\x8f\xe5\x98\x86\xe5\x98\x85\xef\x84\x8f\xe8\x94\x8b\xe0\xa4\x83\xef\x84\x92\xe1\xa0\x82\xeb\x8c\x8b\xe0\xa4\x82\xe0\xa4\x80\xe9\x90\x89\xe5\x98\x87\xed\x88\x90'
            # Invalid sequence after slicing: '\xe5\x98\x88\xe8\x94\x89\xed\x88\x8e\xeb\x8c\x8d\xe8\x94\x89\xeb\x8c\x8e\xeb\x8c\x8b\xe5\x98\x86\xe1'
            # (the trailing '\xe1' is a lead byte whose continuation
            # bytes were cut off)
            val = val[:max_len]

        metadata[key] = val

    # Parenthesised single-argument print behaves the same on Python 2
    # and Python 3.
    print(repr(orig_val))
    print(repr(val))
    # The following will raise UnicodeDecodeError on Python 2
    json.dumps(metadata)


 # Run the reproduction only when executed as a script, not on import.
 if __name__ == '__main__':
    test()
 ~
	import json
	import uuid
	import random

	# Basics
	# .encode('utf-8') converts unicode into byte sequence
	# .decode('utf-8') converts byte sequence into unicode
	#
	# A unicode code point is encoded into a sequence of bytes.
	# In UTF-8 this encoded sequence is one to four bytes long (the original
	# specification allowed up to six bytes; RFC 3629 restricts it to four).


	def create_ascii_name():
	    """Return a random ASCII name: the hex form of a fresh version-4 UUID."""
	    return uuid.uuid4().hex


	def create_utf8_name(length=15):
	    """Return ``length`` random non-ASCII characters encoded as UTF-8 bytes.

	    Characters are drawn, with repetition, from a fixed pool of code
	    points. Every pool member encodes to three bytes except U+03A9,
	    which encodes to two, so the result is longer in bytes than in
	    characters.

	    :param length: number of characters to pick (default 15).
	    :returns: a valid UTF-8 byte string (``str`` on Python 2).
	    """
	    utf8_chars = u'\uF10F\uD20D\uB30B\u9409\u8508\u5605\u3703\u1801'\
	                 u'\u0900\uF110\uD20E\uB30C\u940A\u8509\u5606\u3704'\
	                 u'\u1802\u0901\uF111\uD20F\uB30D\u940B\u850A\u5607'\
	                 u'\u3705\u1803\u0902\uF112\uD210\uB30E\u940C\u850B'\
	                 u'\u5608\u3706\u1804\u0903\u03A9\u2603'
	    # range() instead of xrange(): same result for a handful of items and
	    # the name also exists on Python 3.
	    picked = [random.choice(utf8_chars) for _ in range(length)]
	    # Join with a unicode separator so no implicit str->unicode promotion
	    # is involved before encoding.
	    return u''.join(picked).encode('utf-8')


	def test():
	    """Reproduce json.dumps() failing on byte-truncated UTF-8 values.

	    Builds a dict of 80 entries. Each key is a random hex name cut to 25
	    characters; each value is a random UTF-8 byte string cut to 25
	    *bytes*. The last untruncated and truncated values are printed, then
	    the dict is passed to json.dumps().

	    :raises UnicodeDecodeError: from json.dumps() on Python 2, once at
	        least one stored value is no longer valid UTF-8.
	    NOTE(review): the script targets Python 2; on Python 3 json.dumps()
	    rejects bytes values with TypeError instead.
	    """
	    entry_count = 80
	    max_len = 25
	    metadata = {}
	    while len(metadata) < entry_count:
	        key = create_ascii_name()
	        # This returns a valid utf8 character sequence
	        val = create_utf8_name()
	        # Remember the untruncated value on every pass so the print below
	        # can never refer to an unassigned name.
	        orig_val = val

	        if len(key) > max_len:
	            key = key[:max_len]
	            # The value is sliced by bytes, not by characters, so the cut
	            # usually lands in the middle of a multi-byte character and
	            # turns the valid 'utf8' byte sequence into an invalid one.
	            # https://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
	            # Examples (taken from different runs):
	            # Valid sequence before slicing: '\xe9\x90\x8b\xef\x84\x8f\xe5\x98\x86\xe5\x98\x85\xef\x84\x8f\xe8\x94\x8b\xe0\xa4\x83\xef\x84\x92\xe1\xa0\x82\xeb\x8c\x8b\xe0\xa4\x82\xe0\xa4\x80\xe9\x90\x89\xe5\x98\x87\xed\x88\x90'
	            # Invalid sequence after slicing: '\xe5\x98\x88\xe8\x94\x89\xed\x88\x8e\xeb\x8c\x8d\xe8\x94\x89\xeb\x8c\x8e\xeb\x8c\x8b\xe5\x98\x86\xe1'
	            # (the trailing '\xe1' is a lead byte whose continuation
	            # bytes were cut off)
	            val = val[:max_len]

	        metadata[key] = val

	    # Parenthesised single-argument print behaves the same on Python 2
	    # and Python 3.
	    print(repr(orig_val))
	    print(repr(val))
	    # The following will raise UnicodeDecodeError on Python 2
	    json.dumps(metadata)


	# Run the reproduction only when executed as a script, not on import.
	if __name__ == '__main__':
	    test()
	~