Skip to content

Instantly share code, notes, and snippets.

@jimbaker
Last active August 29, 2015 14:15
Show Gist options
  • Save jimbaker/20a1917624e1dc25b768 to your computer and use it in GitHub Desktop.
Save jimbaker/20a1917624e1dc25b768 to your computer and use it in GitHub Desktop.
Write Avro output file by appending. Modified from https://gist.github.com/esheffield/3514988368eb42db1203
import datetime
import string
import random
import json
import sys
import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter
# Number of user records generated for each benchmark run.
USER_COUNT = 10000
# Number of random entries placed in every user's "meta" map, taken from the
# first command-line argument. Falls back to 50 (the same default make_user
# uses) when no argument is supplied, rather than failing with an IndexError.
META_SIZE = int(sys.argv[1]) if len(sys.argv) > 1 else 50
# Values a user's "size" field is drawn from.
SIZES = ["LARGE", "MEDIUM", "SMALL"]
def strgen(size=6, chars=string.ascii_uppercase + string.digits):
    """Return a random string of ``size`` characters drawn from ``chars``.

    Every character is picked independently with ``random.choice``. The
    default alphabet is the uppercase ASCII letters followed by the digits.
    """
    picked = []
    for _ in range(size):
        picked.append(random.choice(chars))
    return ''.join(picked)
def rand_str_or_none(size=6, none_chance=0.25):
    """Return a random string of ``size`` characters, or ``None`` some of the time.

    Args:
        size: Length of the string produced by ``strgen`` when a string is
            returned.
        none_chance: Probability, between 0 and 1, of returning ``None``.

    Returns:
        ``None`` with probability ``none_chance``, otherwise ``strgen(size=size)``.
    """
    # random.random() is uniform on [0.0, 1.0), so a strict comparison never
    # yields None for none_chance=0 and always yields None for none_chance=1.
    # The earlier randint(0, 1000) <= cutoff check drew from 1001 values and
    # could return None even when none_chance was 0.
    if random.random() < none_chance:
        return None
    return strgen(size=size)
def rand_int_or_none(min=0, max=100, none_chance=0.25):
    """Return a random integer in ``[min, max]``, or ``None`` some of the time.

    Args:
        min: Inclusive lower bound. The name shadows the builtin but is kept
            because callers pass it by keyword.
        max: Inclusive upper bound (same naming caveat as ``min``).
        none_chance: Probability, between 0 and 1, of returning ``None``.

    Returns:
        ``None`` with probability ``none_chance``, otherwise
        ``random.randint(min, max)``.
    """
    # random.random() is uniform on [0.0, 1.0), so a strict comparison never
    # yields None for none_chance=0 and always yields None for none_chance=1.
    # The earlier randint(0, 1000) <= cutoff check drew from 1001 values and
    # could return None even when none_chance was 0.
    if random.random() < none_chance:
        return None
    return random.randint(min, max)
def make_user(meta_size=50):
    """Build one randomly populated user record for the benchmark.

    Args:
        meta_size: Number of random key/value pairs inserted into the
            record's "meta" map.

    Returns:
        A dict with the keys "name", "favorite_number", "favorite_color",
        "size" and "meta". "favorite_number" and "favorite_color" may be
        ``None``.
    """
    user = {
        "name": strgen(size=random.randint(6, 15)),
        "favorite_number": rand_int_or_none(min=0, max=1000),
        "favorite_color": rand_str_or_none(),
        # Choose from SIZES directly so the pick stays valid if the list ever
        # grows or shrinks (it used to be indexed with a hard-coded 0..2).
        "size": random.choice(SIZES),
        "meta": {},
    }
    for _ in range(meta_size):
        # Keys are random 6-character strings; a repeated key simply
        # overwrites the earlier entry.
        user["meta"][strgen()] = strgen(size=random.randint(6, 50))
    return user
# Benchmark driver: build the users in memory, then time writing them to an
# Avro container file and to a JSON file.
print("Creating %d users with meta size %d" % (USER_COUNT, META_SIZE))
start = datetime.datetime.now()
users = [make_user(meta_size=META_SIZE) for _ in range(USER_COUNT)]
print("Done!")
end = datetime.datetime.now()
print("Difference: %s" % (end - start))
print("***************")
print("Serializing with Avro...")
# Read the schema inside a with-block so the file handle is closed right away
# instead of being left for the garbage collector.
with open("user.avsc") as schema_file:
    schema1 = avro.schema.parse(schema_file.read())
start = datetime.datetime.now()
# NOTE: the test_avro/ directory must already exist; open() does not create it.
# The end time is taken after the with-block exits, so closing the writer is
# part of the measured interval.
with DataFileWriter(open("test_avro/users.avro", "wb"), DatumWriter(), schema1) as writer:
    for user in users:
        writer.append(user)
end = datetime.datetime.now()
print("Difference: %s" % (end - start))
print("***************")
print("Serializing with JSON...")
start = datetime.datetime.now()
with open("test_avro/users.json", "w") as json_file:
    for user in users:
        # One JSON document per line; without a separator the records run
        # together and the file cannot be parsed back.
        json_file.write(json.dumps(user) + "\n")
end = datetime.datetime.now()
print("Difference: %s" % (end - start))
print("Done!")
import datetime
import hashlib
import json
import random
import string
import sys
from cStringIO import StringIO
import avro.schema
from avro.io import BinaryEncoder, DatumWriter
# Number of user records generated for each benchmark run.
USER_COUNT = 10000
# Number of random entries placed in every user's "meta" map, taken from the
# first command-line argument. Falls back to 50 (the same default make_user
# uses) when no argument is supplied, rather than failing with an IndexError.
META_SIZE = int(sys.argv[1]) if len(sys.argv) > 1 else 50
# Values a user's "size" field is drawn from.
SIZES = ["LARGE", "MEDIUM", "SMALL"]
def strgen(size=6, chars=string.ascii_uppercase + string.digits):
    """Generate a random string.

    ``size`` is the number of characters to produce and ``chars`` is the
    alphabet they are drawn from (uppercase ASCII letters and digits by
    default). Characters are chosen one at a time with ``random.choice``.
    """
    letters = [random.choice(chars) for _ in range(size)]
    return ''.join(letters)
def rand_str_or_none(size=6, none_chance=0.25):
    """Return a random string of ``size`` characters, or ``None`` some of the time.

    Args:
        size: Length of the string produced by ``strgen`` when a string is
            returned.
        none_chance: Probability, between 0 and 1, of returning ``None``.

    Returns:
        ``None`` with probability ``none_chance``, otherwise ``strgen(size=size)``.
    """
    # random.random() is uniform on [0.0, 1.0), so a strict comparison never
    # yields None for none_chance=0 and always yields None for none_chance=1.
    # The earlier randint(0, 1000) <= cutoff check drew from 1001 values and
    # could return None even when none_chance was 0.
    if random.random() < none_chance:
        return None
    return strgen(size=size)
def rand_int_or_none(min=0, max=100, none_chance=0.25):
    """Return a random integer in ``[min, max]``, or ``None`` some of the time.

    Args:
        min: Inclusive lower bound. The name shadows the builtin but is kept
            because callers pass it by keyword.
        max: Inclusive upper bound (same naming caveat as ``min``).
        none_chance: Probability, between 0 and 1, of returning ``None``.

    Returns:
        ``None`` with probability ``none_chance``, otherwise
        ``random.randint(min, max)``.
    """
    # random.random() is uniform on [0.0, 1.0), so a strict comparison never
    # yields None for none_chance=0 and always yields None for none_chance=1.
    # The earlier randint(0, 1000) <= cutoff check drew from 1001 values and
    # could return None even when none_chance was 0.
    if random.random() < none_chance:
        return None
    return random.randint(min, max)
def make_user(meta_size=50):
    """Build one randomly populated user record for the benchmark.

    Args:
        meta_size: Number of random key/value pairs inserted into the
            record's "meta" map.

    Returns:
        A dict with the keys "name", "favorite_number", "favorite_color",
        "size" and "meta". "favorite_number" and "favorite_color" may be
        ``None``.
    """
    user = {
        "name": strgen(size=random.randint(6, 15)),
        "favorite_number": rand_int_or_none(min=0, max=1000),
        "favorite_color": rand_str_or_none(),
        # Choose from SIZES directly so the pick stays valid if the list ever
        # grows or shrinks (it used to be indexed with a hard-coded 0..2).
        "size": random.choice(SIZES),
        "meta": {}
    }
    for _ in range(meta_size):
        # Keys are random 6-character strings; a repeated key simply
        # overwrites the earlier entry.
        user["meta"][strgen()] = strgen(size=random.randint(6, 50))
    return user
# Benchmark driver: build the users in memory, then time encoding each one as
# a standalone Avro message and as a JSON string (nothing is written to disk).
start = datetime.datetime.now()
print("Creating %d users with meta size %d" % (USER_COUNT, META_SIZE))
users = [make_user(meta_size=META_SIZE) for _ in range(USER_COUNT)]
print("Done!")
end = datetime.datetime.now()
print("Difference: %s" % (end - start))
print("***************")
print("Serializing with Avro...")
with open("user.avsc") as f:
    user_schema_text = f.read()
# Re-serialize the schema with no insignificant whitespace and take its MD5
# digest to use as a schema identifier in every message.
# NOTE(review): this only strips whitespace; key order follows the parsed
# dict, so the digest is presumably not the Avro "Parsing Canonical Form"
# fingerprint - confirm before relying on it across processes.
normalized_user_schema_text = json.dumps(json.loads(user_schema_text), separators=(',', ':'))
schema_md5 = hashlib.md5(normalized_user_schema_text).digest()
user_schema = avro.schema.parse(user_schema_text)
# Every message starts with the same header (magic byte 1 followed by the
# schema digest), so build it once instead of on every iteration.
message_header = chr(1) + schema_md5
start = datetime.datetime.now()
datum_writer = DatumWriter()
datum_writer.writers_schema = user_schema
for user in users:
    # we may be able to optimize python-kafka such that it looks like a file-like object;
    # for now, it needs to be given a byte string (or perhaps something that supports memoryview)
    # (named "out" rather than "buffer" to avoid shadowing the Python 2 builtin)
    out = StringIO()
    out.write(message_header)
    encoder = BinaryEncoder(out)
    datum_writer.write(user, encoder)
    payload = out.getvalue()
    # pretend to enqueue payload into Kafka - not measuring Kafka performance at this time
end = datetime.datetime.now()
print("Difference: %s" % (end - start))
print("***************")
print("Serializing with JSON...")
start = datetime.datetime.now()
for user in users:
    payload = json.dumps(user)
    # pretend to enqueue payload into Kafka
end = datetime.datetime.now()
print("Difference: %s" % (end - start))
print("Done!")
(kafka)jimbaker:kafkadev jbaker$ python append-compare.py 50
Creating 10000 users with meta size 50
Done!
Difference: 0:00:14.947236
***************
Serializing with Avro...
Difference: 0:00:07.882527
***************
Serializing with JSON...
Difference: 0:00:00.327761
Done!
(kafka)jimbaker:kafkadev jbaker$ python append-compare.py 0
Creating 10000 users with meta size 0
Done!
Difference: 0:00:00.252153
***************
Serializing with Avro...
Difference: 0:00:00.665084
***************
Serializing with JSON...
Difference: 0:00:00.068431
Done!
(kafka)jimbaker:kafkadev jbaker$ python datum-writer-compare.py 50
Creating 10000 users with meta size 50
Done!
Difference: 0:00:15.320449
***************
Serializing with Avro...
Difference: 0:00:08.379228
***************
Serializing with JSON...
Difference: 0:00:00.237256
Done!
(kafka)jimbaker:kafkadev jbaker$ python datum-writer-compare.py 0
Creating 10000 users with meta size 0
Done!
Difference: 0:00:00.256930
***************
Serializing with Avro...
Difference: 0:00:00.663297
***************
Serializing with JSON...
Difference: 0:00:00.059439
Done!
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment