@dchentech
Created May 13, 2015 05:24
JSON vs. eval performance benchmark; see the requirement at https://github.com/spotify/luigi/pull/939
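To make the comparison concrete, here is a small self-contained sketch (an illustrative addition, not the gist's own code) timing the two round trips on a Counter-like dict:

# Illustrative timeit sketch of the two round trips being benchmarked:
# Python literal repr()/eval() vs. ujson dumps()/loads().
import timeit
import ujson

d = dict((u"word%d" % i, i) for i in range(1000))  # a dict like one Counter row

def python_roundtrip():
    return eval(repr(d))

def ujson_roundtrip():
    return ujson.loads(ujson.dumps(d))

for func in (python_roundtrip, ujson_roundtrip):
    print("%s: %.2fs for 1000 round trips" % (func.__name__, timeit.timeit(func, number=1000)))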
# -*-coding:utf-8-*-
# local_benchmark.py
from etl_utils import process_notifier
from collections import Counter, defaultdict
import ujson

null = None  # lets eval() accept JSON-style null literals in raw lines


def char_count(func):
    result = defaultdict(int)

    root = "/home/primary_user/tmp/"
    f1 = root + "redmine10050_final_merge_range.json"  # 139.5 MB
    # f1 = root + "en_exam_20150429.json"  # 1.1 GB
    # f1 = root + "redmine9523_final_report_four_weeks_before_range.json/16GB.txt"

    for line in process_notifier(file(f1)):
        if isinstance(line, str):
            line = line.decode("UTF-8")
        d1 = Counter(line.split(" "))  # word count
        # d1 = line  # raw-line variant
        # d1 = dict(Counter(list(line)))  # char count variant
        # d1 = func(d1)  # benchmark serialize and deserialize
        for k2, v2 in d1.iteritems():
            result[k2] += v2


def python_func(d1):
    return eval(repr(d1))
    # return eval(d1)  # deserialize-only variant


def ujson_func(d1):
    return ujson.loads(ujson.dumps(d1))
    # return ujson.loads(d1)  # deserialize-only variant


print "** python_func"
char_count(python_func)

print "** ujson_func"
char_count(ujson_func)

# 139.5 MB file: Python 3.3 MB/s, ujson 3.7 MB/s
# 1.2 GB file:   Python 1.5 MB/s
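
# Why "null = None" is defined at the top: the deserialize-only variant
# eval(d1) runs eval() directly on JSON text, where missing values appear
# as the bare name null; binding null to None makes that parse. For example:
assert eval('{"a": 1, "b": null}') == {"a": 1, "b": None}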
"""
####### decode
(ENV)[primary_user@BJ-NAMENODE-145 benchmark_mr_internal_data_interchange]$ python local_benchmark.py
** python_func
{pid: 19344, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 7.5 MB/s
** ujson_func
{pid: 19344, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |#################################################################| 70.1 MB/s
####### word + decode + encode
(ENV)[primary_user@BJ-NAMENODE-145 benchmark_mr_internal_data_interchange]$ python local_benchmark.py
** python_func
{pid: 23318, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 9.6 MB/s
** ujson_func
{pid: 23318, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |#################################################################| 19.1 MB/s
####### char + decode + encode
(ENV)[primary_user@BJ-NAMENODE-145 benchmark_mr_internal_data_interchange]$ python local_benchmark.py
** python_func
{pid: 25309, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 3.5 MB/s
** ujson_func
{pid: 25309, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 3.7 MB/s
####### char
{pid:5204, file:/home/primary_user/tmp/redmine10050_final_merge_range.json, size:139.5 MB}100% |########################################################################| 3.8 MB/s
####### word
{pid:5846, file:/home/primary_user/tmp/redmine10050_final_merge_range.json, size:139.5 MB}100% |#######################################################################| 21.8 MB/s
"""
# -*-coding:utf-8-*-
from __future__ import print_function
from luiti import luigi, TargetUtils, MRUtils, defaultdict, TaskDayHadoop
from collections import Counter

luigi.plug_packages("ujson", "jsonpickle")


class LargeFile(luigi.ExternalTask):

    def output(self):
        return TargetUtils.hdfs("/primary/BI_report/redmine10050_afenti_experience_report/2015-05-03/redmine10050_final_merge_range.json")  # 133 MB
        # return TargetUtils.hdfs("/primary/question_result/en_exam/en_exam_20150429.json")  # 1.1 GB
        # return TargetUtils.hdfs("/primary/BI_report/afenti_stimulate_the_paying_customers_201504_report/english/2015-05-04/redmine9523_final_report_four_weeks_before_range.json")  # 16 GB


class WordCountTemplate(TaskDayHadoop):

    root_dir = "/primary/experiments/benchmark_mr_internal_data_interchange"
    n_reduce_tasks = 30

    def requires(self):
        self.serialize    # preload
        self.deserialize  # preload
        return LargeFile()

    def mapper(self, line):
        if isinstance(line, str):
            line = line.decode("UTF-8")
        d1 = Counter(list(line))  # char count -- very CPU heavy!
        # d1 = Counter(line.split(" "))  # word count variant
        d2 = dict(d1)  # plain dict, so it serializes as JSON
        _ = len(d2)  # key by dict size to spread work across reducers
        yield _, d2

    def reducer(self, _, words_counters):
        result = defaultdict(int)
        for words_counter in words_counters:
            for word, count in words_counter.iteritems():
                result[word] += count
        result["serialize"] = str(self.serialize)
        result["deserialize"] = str(self.deserialize)
        yield "", MRUtils.str_dump(result)
"""
### Already warmed by run reading the input file.
mapreduce.task.io.sort.mb == 512MB
dfs.block.size == 128 MB
n_reduce_tasks = 30
CharCount + json 133 MB input file
Python 48s
JSON 50s
CharCount + json 1.1 GB input file
Python 2m:0s
JSON 1m:30s
CharCount + json 16 GB input file
Python 3m:17s
JSON 7m:13s
********************************
CharCount + ujson 133 MB input file
Python 52s
JSON 51s
CharCount + ujson 1.1 GB input file
Python 2m:2s
JSON 1m:18s
CharCount + ujson 16 GB input file
Python 3m:13s
JSON 2m:42s
********************************
WordCount 133 MB input file
Python 30s
JSON 30s
WordCount 1.1 GB input file
Python 1m:32s
JSON 48s
WordCount 16 GB input file
Python 2m:22s
JSON 1m:21s
"""
class JsonBenchmarkDay(WordCountTemplate):
    data_interchange_format = "json"


class PythonBenchmarkDay(WordCountTemplate):
    data_interchange_format = "python"
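
For readers unfamiliar with luiti, here is a minimal sketch of what the data_interchange_format flag plausibly switches between. The SERIALIZERS table is an assumption for illustration, not luiti's actual implementation:

# Hypothetical illustration -- not luiti's real code -- of the two
# serialize/deserialize pairs compared above.
import ujson

SERIALIZERS = {
    "python": (repr, eval),              # Python literal round trip
    "json": (ujson.dumps, ujson.loads),  # ujson round trip
}

serialize, deserialize = SERIALIZERS["json"]
payload = serialize({u"a": 1})
assert deserialize(payload) == {u"a": 1}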