Skip to content

Instantly share code, notes, and snippets.

@prashanthpai
Created September 22, 2015 06:59
Show Gist options
  • Save prashanthpai/0e75480abcf3349ab636 to your computer and use it in GitHub Desktop.
Save prashanthpai/0e75480abcf3349ab636 to your computer and use it in GitHub Desktop.
testMetadataLimit
import json
import uuid
import random
# Basics
# .encode('utf-8') converts unicode into byte sequence
# .decode('utf-8') converts byte sequence into unicode
#
# A unicode code point is encoded into a sequence of bytes.
# In UTF-8 this encoded sequence is one to four bytes long (RFC 3629);
# the original UTF-8 design allowed sequences of up to six bytes.
def create_ascii_name():
    """Return a random ASCII-only name.

    The name is the hexadecimal form of a freshly generated version-4
    UUID: 32 characters drawn from ``0-9a-f``.
    """
    fresh_uuid = uuid.uuid4()
    return fresh_uuid.hex
def create_utf8_name(length=15):
    """Return a random non-ASCII name as a UTF-8 encoded byte string.

    Args:
        length: Number of unicode characters to draw. Defaults to 15,
            the value that was previously hard-coded.

    Returns:
        The UTF-8 encoding of ``length`` characters, each picked
        independently from a fixed pool of non-ASCII code points. Every
        character in the pool encodes to two or three bytes, so the
        result is a valid UTF-8 sequence that is longer in bytes than
        in characters.
    """
    utf8_chars = (
        u'\uF10F\uD20D\uB30B\u9409\u8508\u5605\u3703\u1801'
        u'\u0900\uF110\uD20E\uB30C\u940A\u8509\u5606\u3704'
        u'\u1802\u0901\uF111\uD20F\uB30D\u940B\u850A\u5607'
        u'\u3705\u1803\u0902\uF112\uD210\uB30E\u940C\u850B'
        u'\u5608\u3706\u1804\u0903\u03A9\u2603'
    )
    # range() instead of xrange(): xrange only exists on Python 2, while
    # range() gives the same iteration here on both Python 2 and 3.
    chosen = [random.choice(utf8_chars) for _ in range(length)]
    return u''.join(chosen).encode('utf-8')
def test():
    """Reproduce the JSON failure caused by truncating UTF-8 byte strings.

    Builds a metadata dict of 80 entries. Each key is a random ASCII name
    and each value is a random UTF-8 byte string, both cut to 25 units.
    Cutting the value by byte position can split a multi-byte character,
    leaving an invalid UTF-8 sequence. Every original and truncated value
    is printed with ``repr`` so the damage is visible, and the dict is
    then handed to ``json.dumps``.

    Raises:
        UnicodeDecodeError: on Python 2, when ``json.dumps`` tries to
            decode a truncated value as UTF-8.
        TypeError: on Python 3, where ``json.dumps`` refuses ``bytes``
            values outright.
    """
    entry_count = 80
    max_len = 25
    metadata = {}
    while len(metadata) < entry_count:
        # The key is plain ASCII hex, so cutting it by position is safe.
        # Slicing is already a no-op on short strings, so no length check
        # is needed first.
        key = create_ascii_name()[:max_len]
        # This returns a valid utf8 character sequence
        orig_val = create_utf8_name()
        # This slicing that we do here will make the above valid
        # 'utf8' byte sequence to be invalid.
        # https://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
        # Example (the two samples come from different iterations):
        # Valid sequence before slicing: '\xe9\x90\x8b\xef\x84\x8f\xe5\x98\x86\xe5\x98\x85\xef\x84\x8f\xe8\x94\x8b\xe0\xa4\x83\xef\x84\x92\xe1\xa0\x82\xeb\x8c\x8b\xe0\xa4\x82\xe0\xa4\x80\xe9\x90\x89\xe5\x98\x87\xed\x88\x90'
        # Invalid sequence after slicing: '\xe5\x98\x88\xe8\x94\x89\xed\x88\x8e\xeb\x8c\x8d\xe8\x94\x89\xeb\x8c\x8e\xeb\x8c\x8b\xe5\x98\x86\xe1'
        val = orig_val[:max_len]
        metadata[key] = val
        # Parenthesised print: valid on Python 3, and on Python 2 it is the
        # print statement applied to a parenthesised expression, so the
        # output is the same.
        print(repr(orig_val))
        print(repr(val))
    # The following will raise UnicodeDecodeError (TypeError on Python 3).
    # The result is deliberately discarded; only the failure matters.
    json.dumps(metadata)
# Run the reproducer only when executed as a script, not when imported.
if __name__ == "__main__":
    test()
~
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment