Created
May 13, 2015 05:24
-
-
Save dchentech/72f34141a6e9c56695d5 to your computer and use it in GitHub Desktop.
JSON vs. eval performance benchmark; see the requirement discussed at https://github.com/spotify/luigi/pull/939
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*-coding:utf-8-*- | |
from etl_utils import process_notifier | |
from collections import Counter, defaultdict | |
import ujson | |
# Alias JSON's `null` to Python's None. Presumably this lets raw JSON text be
# eval()'d directly (see the commented-out `eval((d1))` variant) -- confirm.
null = None
def char_count(func):
    """Tally token frequencies over every line of the benchmark input file.

    The input path is hard-coded below. Each line is decoded to unicode when
    it arrives as a byte string, split on single spaces, and the per-line
    counts are folded into one running total.

    Args:
        func: Serialize/deserialize round-trip under test (``python_func`` or
            ``ujson_func``). It is only invoked when the commented-out
            ``d1 = func(d1)`` line below is enabled; as written it is unused.

    Returns:
        A ``defaultdict(int)`` mapping each token to its total count.
    """
    result = defaultdict(int)

    root = "/home/primary_user/tmp/"
    f1 = root + "redmine10050_final_merge_range.json"  # 139.5 MB
    # f1 = root + "en_exam_20150429.json"  # 1.1 GB
    # f1 = root + "redmine9523_final_report_four_weeks_before_range.json/16GB.txt"

    # Open inside `with` so the file handle is closed even if a line fails.
    with open(f1) as source:
        for line in process_notifier(source):
            if isinstance(line, str):
                line = line.decode("UTF-8")

            # Benchmark configuration switches: pick word-level or char-level
            # counting, and optionally add the serialize/deserialize round trip.
            d1 = Counter(line.split(" "))
            # d1 = dict(Counter(list(line)))
            # d1 = func(d1)  # benchmark serialize and deserialize

            for k2, v2 in d1.iteritems():
                result[k2] += v2

    return result
def python_func(d1):
    """Round-trip ``d1`` through Python literal syntax: ``repr`` then ``eval``.

    NOTE: ``eval`` executes arbitrary expressions; this is only appropriate
    for trusted benchmark data.
    """
    # Deserialize-only variant of the benchmark: return eval((d1))
    literal_text = repr(d1)
    return eval(literal_text)
def ujson_func(d1):
    """Round-trip ``d1`` through JSON text using ujson: ``dumps`` then ``loads``."""
    # Deserialize-only variant of the benchmark: return ujson.loads(d1)
    json_text = ujson.dumps(d1)
    return ujson.loads(json_text)
# Run the same counting pass once per round-trip implementation, printing a
# banner before each so the progress output can be told apart.
for banner, roundtrip in (
    ("** python_func", python_func),
    ("** ujson_func", ujson_func),
):
    print(banner)
    char_count(roundtrip)
# 139.5 MB | |
# Python 3.3 MB/s | |
# ujson 3.7 MB/s | |
# 1.2 GB | |
# Python 1.5 MB/s | |
""" | |
####### decode | |
(ENV)[primary_user@BJ-NAMENODE-145 benchmark_mr_internal_data_interchange]$ python local_benchmark.py | |
** python_func | |
{pid: 19344, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 7.5 MB/s | |
** ujson_func | |
{pid: 19344, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |#################################################################| 70.1 MB/s | |
####### word + decode + encode | |
(ENV)[primary_user@BJ-NAMENODE-145 benchmark_mr_internal_data_interchange]$ python local_benchmark.py | |
** python_func | |
{pid: 23318, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 9.6 MB/s | |
** | |
{pid: 23318, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |#################################################################| 19.1 MB/s | |
####### char + decode + encode | |
(ENV)[primary_user@BJ-NAMENODE-145 benchmark_mr_internal_data_interchange]$ python local_benchmark.py | |
** python_func | |
{pid: 25309, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 3.5 MB/s | |
** ujson_func | |
{pid: 25309, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 3.7 MB/s | |
####### char | |
{pid:5204, file:/home/primary_user/tmp/redmine10050_final_merge_range.json, size:139.5 MB}100% |########################################################################| 3.8 MB/s | |
####### word | |
{pid:5846, file:/home/primary_user/tmp/redmine10050_final_merge_range.json, size:139.5 MB}100% |#######################################################################| 21.8 MB/s | |
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*-coding:utf-8-*- | |
from __future__ import print_function | |
from luiti import luigi, TargetUtils, MRUtils, defaultdict, TaskDayHadoop | |
luigi.plug_packages("ujson", "jsonpickle") | |
from collections import Counter | |
class LargeFile(luigi.ExternalTask):
    """External task whose output is a pre-existing input file on HDFS."""

    def output(self):
        """Return the HDFS target for the input file currently selected."""
        # Swap in one of the alternatives below to benchmark a larger input.
        hdfs_path = "/primary/BI_report/redmine10050_afenti_experience_report/2015-05-03/redmine10050_final_merge_range.json"  # 133 MB
        # hdfs_path = "/primary/question_result/en_exam/en_exam_20150429.json"  # 1.1 GB
        # hdfs_path = "/primary/BI_report/afenti_stimulate_the_paying_customers_201504_report/english/2015-05-04/redmine9523_final_report_four_weeks_before_range.json"  # 16 GB
        return TargetUtils.hdfs(hdfs_path)
class WordCountTemplate(TaskDayHadoop):
    """Shared MapReduce job body for the data-interchange benchmark.

    The mapper emits one per-line character-count dict, and the reducer sums
    those dicts. Subclasses only set ``data_interchange_format``.
    """

    root_dir = "/primary/experiments/benchmark_mr_internal_data_interchange"
    n_reduce_tasks = 30

    def requires(self):
        """Touch the (de)serializer attributes, then depend on ``LargeFile``."""
        # Accessed purely for their side effect; the original marks these as
        # a "preload". Presumably they are lazily built by the base class --
        # confirm against luiti.
        self.serialize
        self.deserialize
        return LargeFile()

    def mapper(self, line):
        """Yield ``(distinct_count, counts)`` for one input line.

        ``counts`` maps each character of the line to its number of
        occurrences; the key is the number of distinct characters.
        """
        if isinstance(line, str):
            line = line.decode("UTF-8")

        # Character-level counting is the CPU-heavy variant of the benchmark.
        per_line = Counter(list(line))
        # Word-level variant: per_line = Counter(line.split(" "))

        # A plain dict so the value can be emitted in JSON format.
        payload = dict(per_line)

        # Key on the number of distinct entries to get more partitioning.
        shuffle_key = len(payload)
        yield shuffle_key, payload

    def reducer(self, _, words_counters):
        """Sum all count dicts for one key and yield the dumped total.

        The names of the active serializer and deserializer are stored in the
        result under the ``"serialize"`` and ``"deserialize"`` keys.
        """
        totals = defaultdict(int)
        for one_counter in words_counters:
            for token, occurrences in one_counter.iteritems():
                totals[token] += occurrences

        # Record which (de)serializer produced this output.
        totals["serialize"] = str(self.serialize)
        totals["deserialize"] = str(self.deserialize)

        yield "", MRUtils.str_dump(totals)
""" | |
### Already warmed by run reading the input file. | |
mapreduce.task.io.sort.mb == 512MB | |
dfs.block.size == 128 MB | |
n_reduce_tasks = 30 | |
CharCount + json 133 MB input file | |
Python 48s | |
JSON 50s | |
CharCount + json 1.1 GB input file | |
Python 2m:0s | |
JSON 1m:30s | |
CharCount + json 16 GB input file | |
Python 3m:17s | |
JSON 7m:13s | |
******************************** | |
CharCount + ujson 133 MB input file | |
Python 52s | |
JSON 51s | |
CharCount + ujson 1.1 GB input file | |
Python 2m:2s | |
JSON 1m:18s | |
CharCount + ujson 16 GB input file | |
Python 3m:13s | |
JSON 2m:42s | |
******************************** | |
WordCount 133 MB input file | |
Python 30s | |
JSON 30s | |
WordCount 1.1 GB input file | |
Python 1m:32s | |
JSON 48s | |
WordCount 16 GB input file | |
Python 2m:22s | |
JSON 1m:21s | |
""" | |
class JsonBenchmarkDay(WordCountTemplate):
    """Benchmark job with ``data_interchange_format`` set to ``"json"``."""

    data_interchange_format = "json"
class PythonBenchmarkDay(WordCountTemplate):
    """Benchmark job with ``data_interchange_format`` set to ``"python"``."""

    data_interchange_format = "python"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment