dchentech · May 13, 2015 05:24
diff --git a/local_benchmark.py b/local_benchmark.py
 # -*-coding:utf-8-*-

 from etl_utils import process_notifier
 from collections import Counter, defaultdict
 import ujson
 null = None  # NOTE(review): presumably lets eval() resolve JSON's `null` literal - confirm


 def char_count(func):
    """Aggregate per-line token counts over the benchmark input file.

    Every line is decoded from UTF-8 when it arrives as a byte ``str``,
    split on single spaces and counted; the per-line counts are summed
    into one mapping. Despite the name, the active variant counts
    space-separated tokens; the per-character variant is commented out.

    ``func`` is the serialize/deserialize round trip being benchmarked.
    It is only exercised when the commented-out ``d1 = func(d1)`` line
    below is re-enabled.

    Returns the aggregated ``defaultdict(int)`` mapping token -> count.
    """
    result = defaultdict(int)
    root = "/home/primary_user/tmp/"
    f1 = root + "redmine10050_final_merge_range.json"  # 139.5 MB
    # f1 = root + "en_exam_20150429.json"  # 1.1 GB
    # f1 = root + "redmine9523_final_report_four_weeks_before_range.json/16GB.txt"
    # Open through a context manager so the handle is closed even when the
    # loop raises, instead of leaking it until garbage collection.
    with open(f1) as input_file:
        for line in process_notifier(input_file):
            if isinstance(line, str):
                line = line.decode("UTF-8")
            d1 = Counter(line.split(" "))
            # d1 = dict(Counter(list(line)))
            # d1 = func(d1)  # benchmark serialize and deserialize
            for k2, v2 in d1.iteritems():
                result[k2] += v2
    # Hand the totals back so the work is observable; callers that ignore
    # the return value are unaffected.
    return result


 def python_func(d1):
    """Round-trip ``d1`` through Python literal syntax: ``repr`` then ``eval``."""
    as_source = repr(d1)
    return eval(as_source)
    # Deserialize-only variant: return eval((d1))


 def ujson_func(d1):
    """Round-trip ``d1`` through ujson: ``dumps`` to text, then ``loads``."""
    encoded = ujson.dumps(d1)
    return ujson.loads(encoded)
    # Deserialize-only variant: return ujson.loads(d1)


 # Run the identical counting pass once per round-trip function, printing a
 # banner before each run so the progress output can be told apart.
 for banner, round_trip in (
         ("** python_func", python_func),
         ("** ujson_func", ujson_func)):
    print(banner)
    char_count(round_trip)


 # Throughput observed with the 139.5 MB input file:
 #   Python  3.3 MB/s
 #   ujson   3.7 MB/s

 # Throughput observed with the 1.2 GB input file:
 #   Python  1.5 MB/s


 """
 ####### decode

 (ENV)[primary_user@BJ-NAMENODE-145 benchmark_mr_internal_data_interchange]$ python local_benchmark.py
 ** python_func
  {pid: 19344, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 7.5 MB/s

 ** ujson_func
  {pid: 19344, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |#################################################################| 70.1 MB/s

 ####### word + decode + encode

 (ENV)[primary_user@BJ-NAMENODE-145 benchmark_mr_internal_data_interchange]$ python local_benchmark.py
 ** python_func
  {pid: 23318, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 9.6 MB/s

 **
  {pid: 23318, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |#################################################################| 19.1 MB/s

 ####### char + decode + encode

 (ENV)[primary_user@BJ-NAMENODE-145 benchmark_mr_internal_data_interchange]$ python local_benchmark.py
 ** python_func
  {pid: 25309, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 3.5 MB/s

 ** ujson_func
  {pid: 25309, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 3.7 MB/s


 ####### char
  {pid:5204, file:/home/primary_user/tmp/redmine10050_final_merge_range.json, size:139.5 MB}100% |########################################################################| 3.8 MB/s

 ####### word
  {pid:5846, file:/home/primary_user/tmp/redmine10050_final_merge_range.json, size:139.5 MB}100% |#######################################################################| 21.8 MB/s
 """
diff --git a/mapreduce_benchmark.py b/mapreduce_benchmark.py
 # -*-coding:utf-8-*-

 from __future__ import print_function

 from luiti import luigi, TargetUtils, MRUtils, defaultdict, TaskDayHadoop
 luigi.plug_packages("ujson", "jsonpickle")
 from collections import Counter


 class LargeFile(luigi.ExternalTask):
    """luigi ExternalTask pointing at the benchmark input file on HDFS."""

    def output(self):
        """Return ``TargetUtils.hdfs(...)`` for the currently selected input."""
        # Larger inputs, kept for re-running the benchmark at other sizes:
        # return TargetUtils.hdfs("/primary/question_result/en_exam/en_exam_20150429.json")  # 1.1 GB
        # return TargetUtils.hdfs("/primary/BI_report/afenti_stimulate_the_paying_customers_201504_report/english/2015-05-04/redmine9523_final_report_four_weeks_before_range.json")  # 16 GB
        hdfs_path = "/primary/BI_report/redmine10050_afenti_experience_report/2015-05-03/redmine10050_final_merge_range.json"  # 133 MB
        return TargetUtils.hdfs(hdfs_path)


 class WordCountTemplate(TaskDayHadoop):
    """Shared map/reduce job for the data-interchange benchmark.

    The mapper emits one count dict per input line and the reducer sums
    the dicts it receives. Subclasses only set ``data_interchange_format``.
    """

    root_dir = "/primary/experiments/benchmark_mr_internal_data_interchange"
    n_reduce_tasks = 30

    def requires(self):
        """Depend on the input file after touching the (de)serializer attributes."""
        # Bare attribute reads, kept for their "preload" side effect.
        self.serialize
        self.deserialize
        return LargeFile()

    def mapper(self, line):
        """Yield ``(number_of_distinct_entries, counts)`` for one input line."""
        if isinstance(line, str):
            line = line.decode("UTF-8")
        char_counts = Counter(list(line))  # too heavy CPU !!!
        # char_counts = Counter(line.split(" "))
        plain_counts = dict(char_counts)  # convert to JSON format
        # Keying by the number of distinct entries gives more partitioning.
        yield len(plain_counts), plain_counts

    def reducer(self, _, words_counters):
        """Sum the incoming count dicts and yield a single dumped total."""
        totals = defaultdict(int)
        for one_counter in words_counters:
            for token, occurrences in one_counter.iteritems():
                totals[token] += occurrences
        # Record which serializer/deserializer was active next to the counts.
        totals["serialize"] = str(self.serialize)
        totals["deserialize"] = str(self.deserialize)
        yield "", MRUtils.str_dump(totals)


 """
 ### Already warmed by run reading the input file.

 mapreduce.task.io.sort.mb == 512MB
 dfs.block.size == 128 MB
 n_reduce_tasks = 30

 CharCount + json 133 MB input file
 Python 48s
 JSON   50s

 CharCount + json 1.1 GB input file
 Python 2m:0s
 JSON   1m:30s

 CharCount + json 16 GB input file
 Python 3m:17s
 JSON   7m:13s

 ********************************

 CharCount + ujson 133 MB input file
 Python 52s
 JSON   51s

 CharCount + ujson 1.1 GB input file
 Python 2m:2s
 JSON   1m:18s

 CharCount + ujson 16 GB input file
 Python 3m:13s
 JSON   2m:42s

 ********************************

 WordCount 133 MB input file
 Python 30s
 JSON   30s

 WordCount 1.1 GB input file
 Python 1m:32s
 JSON   48s

 WordCount 16 GB input file
 Python 2m:22s
 JSON   1m:21s
 """

 class JsonBenchmarkDay(WordCountTemplate):
    """Benchmark variant that sets ``data_interchange_format`` to ``"json"``."""

    data_interchange_format = "json"
    
 class PythonBenchmarkDay(WordCountTemplate):
    """Benchmark variant that sets ``data_interchange_format`` to ``"python"``."""

    data_interchange_format = "python"
	# -*-coding:utf-8-*-

	from etl_utils import process_notifier
	from collections import Counter, defaultdict
	import ujson
	null = None  # NOTE(review): presumably lets eval() resolve JSON's `null` literal - confirm


	def char_count(func):
	    """Aggregate per-line token counts over the benchmark input file.

	    Every line is decoded from UTF-8 when it arrives as a byte ``str``,
	    split on single spaces and counted; the per-line counts are summed
	    into one mapping. Despite the name, the active variant counts
	    space-separated tokens; the per-character variant is commented out.

	    ``func`` is the serialize/deserialize round trip being benchmarked.
	    It is only exercised when the commented-out ``d1 = func(d1)`` line
	    below is re-enabled.

	    Returns the aggregated ``defaultdict(int)`` mapping token -> count.
	    """
	    result = defaultdict(int)
	    root = "/home/primary_user/tmp/"
	    f1 = root + "redmine10050_final_merge_range.json"  # 139.5 MB
	    # f1 = root + "en_exam_20150429.json"  # 1.1 GB
	    # f1 = root + "redmine9523_final_report_four_weeks_before_range.json/16GB.txt"
	    # Open through a context manager so the handle is closed even when the
	    # loop raises, instead of leaking it until garbage collection.
	    with open(f1) as input_file:
	        for line in process_notifier(input_file):
	            if isinstance(line, str):
	                line = line.decode("UTF-8")
	            d1 = Counter(line.split(" "))
	            # d1 = dict(Counter(list(line)))
	            # d1 = func(d1)  # benchmark serialize and deserialize
	            for k2, v2 in d1.iteritems():
	                result[k2] += v2
	    # Hand the totals back so the work is observable; callers that ignore
	    # the return value are unaffected.
	    return result


	def python_func(d1):
	    """Round-trip ``d1`` through Python literal syntax: ``repr`` then ``eval``.

	    Returns the object rebuilt by evaluating ``repr(d1)``.
	    """
	    # The body had lost its indentation; it is restored here.
	    return eval(repr(d1))
	    # Deserialize-only variant: return eval((d1))


	def ujson_func(d1):
	    """Round-trip ``d1`` through ujson: ``dumps`` to text, then ``loads``.

	    Returns the object rebuilt by ``ujson.loads``.
	    """
	    # The body had lost its indentation; it is restored here.
	    return ujson.loads(ujson.dumps(d1))
	    # Deserialize-only variant: return ujson.loads(d1)


	# Run the identical counting pass once per round-trip function, printing a
	# banner before each run so the progress output can be told apart.
	for banner, round_trip in (
	        ("** python_func", python_func),
	        ("** ujson_func", ujson_func)):
	    print(banner)
	    char_count(round_trip)


	# Throughput observed with the 139.5 MB input file:
	#   Python  3.3 MB/s
	#   ujson   3.7 MB/s

	# Throughput observed with the 1.2 GB input file:
	#   Python  1.5 MB/s


	"""
	####### decode

	(ENV)[primary_user@BJ-NAMENODE-145 benchmark_mr_internal_data_interchange]$ python local_benchmark.py
	** python_func
	{pid: 19344, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% \|##################################################################\| 7.5 MB/s

	** ujson_func
	{pid: 19344, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% \|#################################################################\| 70.1 MB/s

	####### word + decode + encode

	(ENV)[primary_user@BJ-NAMENODE-145 benchmark_mr_internal_data_interchange]$ python local_benchmark.py
	** python_func
	{pid: 23318, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% \|##################################################################\| 9.6 MB/s

	**
	{pid: 23318, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% \|#################################################################\| 19.1 MB/s

	####### char + decode + encode

	(ENV)[primary_user@BJ-NAMENODE-145 benchmark_mr_internal_data_interchange]$ python local_benchmark.py
	** python_func
	{pid: 25309, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% \|##################################################################\| 3.5 MB/s

	** ujson_func
	{pid: 25309, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% \|##################################################################\| 3.7 MB/s


	####### char
	{pid:5204, file:/home/primary_user/tmp/redmine10050_final_merge_range.json, size:139.5 MB}100% \|########################################################################\| 3.8 MB/s

	####### word
	{pid:5846, file:/home/primary_user/tmp/redmine10050_final_merge_range.json, size:139.5 MB}100% \|#######################################################################\| 21.8 MB/s
	"""
	# -*-coding:utf-8-*-

	from __future__ import print_function

	from luiti import luigi, TargetUtils, MRUtils, defaultdict, TaskDayHadoop
	luigi.plug_packages("ujson", "jsonpickle")
	from collections import Counter


	class LargeFile(luigi.ExternalTask):
	    """luigi ExternalTask pointing at the benchmark input file on HDFS."""

	    def output(self):
	        """Return ``TargetUtils.hdfs(...)`` for the currently selected input."""
	        # Class and method bodies had lost their indentation; restored here.
	        return TargetUtils.hdfs("/primary/BI_report/redmine10050_afenti_experience_report/2015-05-03/redmine10050_final_merge_range.json")  # 133 MB
	        # return TargetUtils.hdfs("/primary/question_result/en_exam/en_exam_20150429.json")  # 1.1 GB
	        # return TargetUtils.hdfs("/primary/BI_report/afenti_stimulate_the_paying_customers_201504_report/english/2015-05-04/redmine9523_final_report_four_weeks_before_range.json")  # 16 GB


	class WordCountTemplate(TaskDayHadoop):
	    """Shared map/reduce job for the data-interchange benchmark.

	    The mapper emits one count dict per input line and the reducer sums
	    the dicts it receives. Subclasses only set ``data_interchange_format``.
	    Class and method bodies had lost their indentation; restored here.
	    """

	    root_dir = "/primary/experiments/benchmark_mr_internal_data_interchange"
	    n_reduce_tasks = 30

	    def requires(self):
	        """Depend on the input file after touching the (de)serializer attributes."""
	        # Bare attribute reads, kept for their "preload" side effect.
	        self.serialize  # preload
	        self.deserialize  # preload
	        return LargeFile()

	    def mapper(self, line):
	        """Yield ``(number_of_distinct_entries, counts)`` for one input line."""
	        if isinstance(line, str):
	            line = line.decode("UTF-8")
	        d1 = Counter(list(line))  # too heavy CPU !!!
	        # d1 = Counter(line.split(" "))
	        d2 = dict(d1)  # convert to JSON format
	        # Keying by the number of distinct entries gives more partitioning.
	        yield len(d2), d2

	    def reducer(self, _, words_counters):
	        """Sum the incoming count dicts and yield a single dumped total."""
	        result = defaultdict(int)
	        for words_counter in words_counters:
	            for word, count in words_counter.iteritems():
	                result[word] += count
	        # Record which serializer/deserializer was active next to the counts.
	        result["serialize"] = str(self.serialize)
	        result["deserialize"] = str(self.deserialize)
	        yield "", MRUtils.str_dump(result)


	"""
	### Already warmed by run reading the input file.

	mapreduce.task.io.sort.mb == 512MB
	dfs.block.size == 128 MB
	n_reduce_tasks = 30

	CharCount + json 133 MB input file
	Python 48s
	JSON 50s

	CharCount + json 1.1 GB input file
	Python 2m:0s
	JSON 1m:30s

	CharCount + json 16 GB input file
	Python 3m:17s
	JSON 7m:13s

	********************************

	CharCount + ujson 133 MB input file
	Python 52s
	JSON 51s

	CharCount + ujson 1.1 GB input file
	Python 2m:2s
	JSON 1m:18s

	CharCount + ujson 16 GB input file
	Python 3m:13s
	JSON 2m:42s

	********************************

	WordCount 133 MB input file
	Python 30s
	JSON 30s

	WordCount 1.1 GB input file
	Python 1m:32s
	JSON 48s

	WordCount 16 GB input file
	Python 2m:22s
	JSON 1m:21s
	"""

	class JsonBenchmarkDay(WordCountTemplate):
	    """Benchmark variant that sets ``data_interchange_format`` to ``"json"``."""

	    # The class body had lost its indentation; it is restored here.
	    data_interchange_format = "json"

	class PythonBenchmarkDay(WordCountTemplate):
	    """Benchmark variant that sets ``data_interchange_format`` to ``"python"``."""

	    # The class body had lost its indentation; it is restored here.
	    data_interchange_format = "python"