Created
September 22, 2015 06:59
-
-
Save prashanthpai/0e75480abcf3349ab636 to your computer and use it in GitHub Desktop.
testMetadataLimit
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import uuid | |
import random | |
# Basics
# .encode('utf-8') converts a unicode string into a byte sequence
# .decode('utf-8') converts a byte sequence into a unicode string
#
# Each Unicode code point is encoded into a sequence of bytes.
# The encoded sequence is one to four bytes long (RFC 3629; the original
# UTF-8 design allowed sequences of up to six bytes).
def create_ascii_name():
    """Return a random ASCII name.

    Returns:
        The hexadecimal form of a freshly generated random (version 4)
        UUID: a 32-character string of lowercase hex digits.
    """
    return uuid.uuid4().hex
def create_utf8_name(length=15):
    """Return a random, valid UTF-8 encoded name.

    Draws ``length`` characters at random (with replacement) from a fixed
    pool of non-ASCII code points and encodes the result as UTF-8.

    Args:
        length: Number of characters (not bytes) to draw. Defaults to 15.

    Returns:
        A byte string containing the UTF-8 encoding of the drawn
        characters. Every character in the pool encodes to two or three
        bytes, so the result is between ``2 * length`` and ``3 * length``
        bytes long.
    """
    utf8_chars = (
        u'\uF10F\uD20D\uB30B\u9409\u8508\u5605\u3703\u1801'
        u'\u0900\uF110\uD20E\uB30C\u940A\u8509\u5606\u3704'
        u'\u1802\u0901\uF111\uD20F\uB30D\u940B\u850A\u5607'
        u'\u3705\u1803\u0902\uF112\uD210\uB30E\u940C\u850B'
        u'\u5608\u3706\u1804\u0903\u03A9\u2603'
    )
    # range() rather than xrange() so the helper runs on Python 2 and 3;
    # the u'' separator keeps the joined value a unicode string on both.
    chars = [random.choice(utf8_chars) for _ in range(length)]
    return u''.join(chars).encode('utf-8')
def test():
    """Reproduce the failure caused by truncating UTF-8 values by bytes.

    Builds a metadata dict of 80 entries. Each key is an ASCII name and
    each value is a UTF-8 byte string; both are cut to at most 25 bytes.
    The original and the truncated value are printed for every entry, and
    the dict is then passed to ``json.dumps``.

    Raises:
        UnicodeDecodeError: from ``json.dumps`` on Python 2, once a
            truncated value ends in the middle of a multi-byte character.
    """
    max_entries = 80
    max_len = 25
    metadata = {}
    while len(metadata) < max_entries:
        # Slicing is a no-op when the key is already short enough.
        key = create_ascii_name()[:max_len]
        # At this point the value is a valid UTF-8 byte sequence.
        orig_val = create_utf8_name()
        # Cutting by byte count can split a multi-byte character, which
        # turns the valid sequence above into an invalid one.
        # https://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
        # Examples (taken from two different values):
        # Valid sequence before slicing: '\xe9\x90\x8b\xef\x84\x8f\xe5\x98\x86\xe5\x98\x85\xef\x84\x8f\xe8\x94\x8b\xe0\xa4\x83\xef\x84\x92\xe1\xa0\x82\xeb\x8c\x8b\xe0\xa4\x82\xe0\xa4\x80\xe9\x90\x89\xe5\x98\x87\xed\x88\x90'
        # Invalid sequence after slicing: '\xe5\x98\x88\xe8\x94\x89\xed\x88\x8e\xeb\x8c\x8d\xe8\x94\x89\xeb\x8c\x8e\xeb\x8c\x8b\xe5\x98\x86\xe1'
        # (the trailing '\xe1' is a lead byte with no continuation bytes)
        val = orig_val[:max_len]
        metadata[key] = val
        print(repr(orig_val))
        print(repr(val))
    # The following will raise UnicodeDecodeError
    json.dumps(metadata)
# Run the reproduction only when this file is executed as a script.
if __name__ == '__main__':
    test()
~ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment