Last active
August 29, 2015 14:15
-
-
Save jimbaker/20a1917624e1dc25b768 to your computer and use it in GitHub Desktop.
Write Avro output file by appending. Modified from https://gist.github.com/esheffield/3514988368eb42db1203
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime | |
import string | |
import random | |
import json | |
import sys | |
import avro.schema | |
from avro.datafile import DataFileReader, DataFileWriter | |
from avro.io import DatumReader, DatumWriter | |
# Number of synthetic user records generated for the benchmark.
USER_COUNT = 10000

# Number of random key/value pairs put into each user's "meta" map, taken from
# the first command-line argument. Fail with a usage message rather than a bare
# IndexError/ValueError traceback when the argument is missing or not an integer.
try:
    META_SIZE = int(sys.argv[1])
except (IndexError, ValueError):
    sys.exit("usage: %s META_SIZE  (META_SIZE must be an integer)" % sys.argv[0])

# Allowed values for a user's "size" field (presumably mirrors an enum in
# user.avsc -- confirm against the schema).
SIZES = ["LARGE", "MEDIUM", "SMALL"]
def strgen(size=6, chars=string.ascii_uppercase + string.digits):
    """Return a random string of ``size`` characters drawn from ``chars``.

    Each character is picked independently with ``random.choice``; by default
    the alphabet is the uppercase ASCII letters plus the decimal digits.
    """
    picked = [random.choice(chars) for _ in range(size)]
    return ''.join(picked)
def rand_str_or_none(size=6, none_chance=0.25):
    """Return a random string of ``size`` characters, or ``None``.

    ``None`` is returned with probability ``none_chance`` (0.0 means never,
    1.0 means always); otherwise the result comes from ``strgen(size=size)``.
    """
    # random.random() is uniform on [0.0, 1.0), so this comparison is true with
    # probability exactly none_chance. The previous randint(0, 1000) <= cutoff
    # test drew from 1001 values with an inclusive bound, which over-counted
    # None and could still return None when none_chance was 0.
    if random.random() < none_chance:
        return None
    return strgen(size=size)
def rand_int_or_none(min=0, max=100, none_chance=0.25):
    """Return a random integer in ``[min, max]`` (inclusive), or ``None``.

    ``None`` is returned with probability ``none_chance`` (0.0 means never,
    1.0 means always). The parameter names ``min``/``max`` shadow the builtins
    but are kept because callers pass them by keyword.
    """
    # random.random() is uniform on [0.0, 1.0), so this comparison is true with
    # probability exactly none_chance. The previous randint(0, 1000) <= cutoff
    # test drew from 1001 values with an inclusive bound, which over-counted
    # None and could still return None when none_chance was 0.
    if random.random() < none_chance:
        return None
    return random.randint(min, max)
def make_user(meta_size=50):
    """Build one synthetic user record as a plain dict.

    The record has a random 6-15 character ``name``, an optional
    ``favorite_number`` (0-1000 or ``None``), an optional ``favorite_color``
    (6-character string or ``None``), a ``size`` picked from ``SIZES`` and a
    ``meta`` map filled with ``meta_size`` random key/value pairs.

    Keys are random 6-character strings, so in the unlikely event of a
    collision the map ends up with fewer than ``meta_size`` entries.
    """
    user = {
        "name": strgen(size=random.randint(6, 15)),
        "favorite_number": rand_int_or_none(min=0, max=1000),
        "favorite_color": rand_str_or_none(),
        # random.choice tracks len(SIZES) instead of a hard-coded last index.
        "size": random.choice(SIZES),
        "meta": {},
    }
    meta = user["meta"]
    for _ in range(meta_size):
        meta[strgen()] = strgen(size=random.randint(6, 50))
    return user
# --- Phase 1: generate the in-memory test data ------------------------------
print("Creating %d users with meta size %d" % (USER_COUNT, META_SIZE))
start = datetime.datetime.now()
users = [make_user(meta_size=META_SIZE) for _ in range(USER_COUNT)]
print("Done!")
end = datetime.datetime.now()
print("Difference: %s" % (end - start))

# --- Phase 2: time appending every user to an Avro data file ----------------
print("***************")
print("Serializing with Avro...")
# Read the schema through a context manager so the handle is closed promptly;
# schema parsing is deliberately kept outside the timed region.
with open("user.avsc") as schema_file:
    schema1 = avro.schema.parse(schema_file.read())
start = datetime.datetime.now()
# NOTE: the test_avro/ directory must already exist; open() does not create it.
with DataFileWriter(open("test_avro/users.avro", "wb"), DatumWriter(), schema1) as writer:
    for user in users:
        writer.append(user)
end = datetime.datetime.now()
print("Difference: %s" % (end - start))

# --- Phase 3: time writing every user as JSON -------------------------------
print("***************")
print("Serializing with JSON...")
start = datetime.datetime.now()
with open("test_avro/users.json", "w") as json_out:
    for user in users:
        # One record per line: without a delimiter the objects run together
        # and the file cannot be read back record by record.
        json_out.write(json.dumps(user))
        json_out.write("\n")
end = datetime.datetime.now()
print("Difference: %s" % (end - start))
print("Done!")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime | |
import hashlib | |
import json | |
import random | |
import string | |
import sys | |
from cStringIO import StringIO | |
import avro.schema | |
from avro.io import BinaryEncoder, DatumWriter | |
# Number of synthetic user records generated for the benchmark.
USER_COUNT = 10000

# Number of random key/value pairs put into each user's "meta" map, taken from
# the first command-line argument. Fail with a usage message rather than a bare
# IndexError/ValueError traceback when the argument is missing or not an integer.
try:
    META_SIZE = int(sys.argv[1])
except (IndexError, ValueError):
    sys.exit("usage: %s META_SIZE  (META_SIZE must be an integer)" % sys.argv[0])

# Allowed values for a user's "size" field (presumably mirrors an enum in
# user.avsc -- confirm against the schema).
SIZES = ["LARGE", "MEDIUM", "SMALL"]
def strgen(size=6, chars=string.ascii_uppercase + string.digits):
    """Generate a random string that is ``size`` characters long.

    Characters are chosen one at a time from ``chars`` via ``random.choice``;
    the default alphabet is uppercase ASCII letters followed by digits.
    """
    pieces = []
    for _ in range(size):
        pieces.append(random.choice(chars))
    return ''.join(pieces)
def rand_str_or_none(size=6, none_chance=0.25):
    """Return a random string of ``size`` characters, or ``None``.

    ``None`` is returned with probability ``none_chance`` (0.0 means never,
    1.0 means always); otherwise the result comes from ``strgen(size=size)``.
    """
    # random.random() is uniform on [0.0, 1.0), so this comparison is true with
    # probability exactly none_chance. The previous randint(0, 1000) <= cutoff
    # test drew from 1001 values with an inclusive bound, which over-counted
    # None and could still return None when none_chance was 0.
    if random.random() < none_chance:
        return None
    return strgen(size=size)
def rand_int_or_none(min=0, max=100, none_chance=0.25):
    """Return a random integer in ``[min, max]`` (inclusive), or ``None``.

    ``None`` is returned with probability ``none_chance`` (0.0 means never,
    1.0 means always). The parameter names ``min``/``max`` shadow the builtins
    but are kept because callers pass them by keyword.
    """
    # random.random() is uniform on [0.0, 1.0), so this comparison is true with
    # probability exactly none_chance. The previous randint(0, 1000) <= cutoff
    # test drew from 1001 values with an inclusive bound, which over-counted
    # None and could still return None when none_chance was 0.
    if random.random() < none_chance:
        return None
    return random.randint(min, max)
def make_user(meta_size=50):
    """Build one synthetic user record as a plain dict.

    The record has a random 6-15 character ``name``, an optional
    ``favorite_number`` (0-1000 or ``None``), an optional ``favorite_color``
    (6-character string or ``None``), a ``size`` picked from ``SIZES`` and a
    ``meta`` map filled with ``meta_size`` random key/value pairs.

    Keys are random 6-character strings, so in the unlikely event of a
    collision the map ends up with fewer than ``meta_size`` entries.
    """
    user = {
        "name": strgen(size=random.randint(6, 15)),
        "favorite_number": rand_int_or_none(min=0, max=1000),
        "favorite_color": rand_str_or_none(),
        # random.choice tracks len(SIZES) instead of a hard-coded last index.
        "size": random.choice(SIZES),
        "meta": {},
    }
    meta = user["meta"]
    for _ in range(meta_size):
        meta[strgen()] = strgen(size=random.randint(6, 50))
    return user
# --- Phase 1: generate the in-memory test data ------------------------------
start = datetime.datetime.now()
print("Creating %d users with meta size %d" % (USER_COUNT, META_SIZE))
users = [make_user(meta_size=META_SIZE) for _ in range(USER_COUNT)]
print("Done!")
end = datetime.datetime.now()
print("Difference: %s" % (end - start))

# --- Phase 2: time encoding each user as a standalone Avro datum ------------
print("***************")
print("Serializing with Avro...")
with open("user.avsc") as f:
    user_schema_text = f.read()

# Fingerprint the schema so each message can identify the schema it was
# written with. sort_keys=True makes the normalized text (and therefore the
# MD5) independent of dict iteration order.
# NOTE(review): this is an ad-hoc normalization, not Avro's Parsing Canonical
# Form -- confirm which fingerprint consumers expect.
normalized_user_schema_text = json.dumps(
    json.loads(user_schema_text), sort_keys=True, separators=(',', ':'))
schema_md5 = hashlib.md5(normalized_user_schema_text).digest()
user_schema = avro.schema.parse(user_schema_text)

# Per-message prefix: one magic byte followed by the schema fingerprint.
# It is identical for every record, so build it once outside the timed loop.
message_header = chr(1) + schema_md5

start = datetime.datetime.now()
datum_writer = DatumWriter()
datum_writer.writers_schema = user_schema
for user in users:
    # we may be able to optimize python-kafka such that it looks like a file-like object;
    # for now, it needs to be given a byte string (or perhaps something that supports memoryview)
    out = StringIO()  # named `out` to avoid shadowing the Python 2 builtin `buffer`
    out.write(message_header)
    datum_writer.write(user, BinaryEncoder(out))
    out.getvalue()
    # pretend to enqueue this value into Kafka - not measuring Kafka performance at this time
end = datetime.datetime.now()
print("Difference: %s" % (end - start))

# --- Phase 3: time encoding each user as JSON -------------------------------
print("***************")
print("Serializing with JSON...")
start = datetime.datetime.now()
for user in users:
    json.dumps(user)
    # pretend to enqueue this value into Kafka
end = datetime.datetime.now()
print("Difference: %s" % (end - start))
print("Done!")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(kafka)jimbaker:kafkadev jbaker$ python append-compare.py 50 | |
Creating 10000 users with meta size 50 | |
Done! | |
Difference: 0:00:14.947236 | |
*************** | |
Serializing with Avro... | |
Difference: 0:00:07.882527 | |
*************** | |
Serializing with JSON... | |
Difference: 0:00:00.327761 | |
Done! | |
(kafka)jimbaker:kafkadev jbaker$ python append-compare.py 0 | |
Creating 10000 users with meta size 0 | |
Done! | |
Difference: 0:00:00.252153 | |
*************** | |
Serializing with Avro... | |
Difference: 0:00:00.665084 | |
*************** | |
Serializing with JSON... | |
Difference: 0:00:00.068431 | |
Done! | |
(kafka)jimbaker:kafkadev jbaker$ python datum-writer-compare.py 50 | |
Creating 10000 users with meta size 50 | |
Done! | |
Difference: 0:00:15.320449 | |
*************** | |
Serializing with Avro... | |
Difference: 0:00:08.379228 | |
*************** | |
Serializing with JSON... | |
Difference: 0:00:00.237256 | |
Done! | |
(kafka)jimbaker:kafkadev jbaker$ python datum-writer-compare.py 0 | |
Creating 10000 users with meta size 0 | |
Done! | |
Difference: 0:00:00.256930 | |
*************** | |
Serializing with Avro... | |
Difference: 0:00:00.663297 | |
*************** | |
Serializing with JSON... | |
Difference: 0:00:00.059439 | |
Done! |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment