Benchmark for Analyzer

require 'bundler/inline'
require 'benchmark'

gemfile do
  source 'https://rubygems.org'
  gem 'pry'
end

# Number of job payloads per batch (defaults to 100_000 when no argument is given).
number_of_job_payloads = (ARGV[0] || 100_000).to_i

class Base
  def initialize(where_conditions: {}, group_by: %w[job_class shop_id api_client_id target_hostname])
    @where_conditions = where_conditions
    @group_by = group_by
    @counts_per_attribute = Hash.new { |hash, key| hash[key] = Hash.new(0) }
  end

  def process_payloads(batches)
    batches.each do |job_payloads|
      process(job_payloads)
    end
  end

  def process(job_payloads)
    group_payloads_attributes_by_job(job_payloads) do |job_payload_attributes|
      job_payload_attributes.each do |attribute_key, attribute_value|
        sanitized_attribute_value = attribute_value.delete('"')
        if @where_conditions[attribute_key]
          break unless @where_conditions[attribute_key].include?(sanitized_attribute_value)
        end
        @counts_per_attribute[attribute_key][sanitized_attribute_value] += 1
      end
    end
  end

  def group_payloads_attributes_by_job(job_payloads)
    raise NotImplementedError
  end
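
  # shop_id appears twice in each payload (nested inside "arguments" and
  # again at the top level), so when grouping by shop_id we use a dedicated
  # regex anchored on `arguments":[{` to count it only once.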
  def pattern_to_extract_attributes
    if @group_by.include?("shop_id")
      group_by_dup = @group_by.clone
      group_by_dup.delete("shop_id")
      Regexp.union(shop_id_regex, attributes_regex(group_by_dup))
    else
      attributes_regex(@group_by)
    end
  end

  def shop_id_regex
    /arguments":\[{\"(shop_id)\":(\"[a-zA-Z0-9:\."-]*\"|\d*)/
  end

  def attributes_regex(attributes)
    /\"(#{Regexp.union(*attributes)})\":(\"[a-zA-Z0-9:\."-]*\"|\d*)/
  end
end

class AnalyzerGlobalScan < Base
  def group_payloads_attributes_by_job(job_payloads)
    batch_of_payloads = job_payloads.join(", ")
    batch_payloads_attributes = batch_of_payloads.scan(pattern_to_extract_attributes).flatten.compact.each_slice(2).to_a
    # Every payload yields the same number of attribute pairs, so dividing
    # by the number of payloads recovers how many pairs belong to each job.
    pairs_per_payload = batch_payloads_attributes.size / job_payloads.size
    batch_payloads_attributes.each_slice(pairs_per_payload) do |job_payload|
      yield job_payload
    end
  end
end

class AnalyzerEachScan < Base
  def group_payloads_attributes_by_job(payloads)
    payloads.each do |payload|
      scanned_results = payload.scan(pattern_to_extract_attributes)
      # We need to flatten to be able to remove the duplicate shop_id
      # attribute. Then we group the values in pairs of
      # (attribute_key, attribute_value).
      yield scanned_results.flatten.compact.each_slice(2).to_a
    end
  end
end

class AnalyzerGlobalScanCustomIteration < Base
  def group_payloads_attributes_by_job(job_payloads)
    batch_of_payloads = job_payloads.join(", ")
    batch_of_payloads.scan(pattern_to_extract_attributes).each_with_object([]) do |attribute_key_value, attributes|
      cleaned_attribute = attribute_key_value.compact
      # Every job payload starts with a job_class attribute, so it marks
      # the beginning of a new group.
      attributes << [] if cleaned_attribute.include?('job_class')
      attributes.last << cleaned_attribute
    end.each do |job_payload|
      yield job_payload
    end
  end
end

class AnalyzerGlobalScanCustomIterationBasicRegex < Base
  def group_payloads_attributes_by_job(job_payloads)
    batch_of_payloads = job_payloads.join(", ")
    batch_of_payloads.scan(pattern_to_extract_attributes).each_with_object([]) do |attribute_key_value, attributes|
      # We group by job_class because we know each job payload
      # has a job_class.
      if attribute_key_value.include?('job_class')
        attributes << []
        attributes.last << attribute_key_value
      else
        # Skipping duplicates solves the issue of shop_id appearing
        # twice in the same payload.
        next if attributes.last.include?(attribute_key_value)
        attributes.last << attribute_key_value
      end
    end.each do |job_payload|
      yield job_payload
    end
  end

  def pattern_to_extract_attributes
    /\"(#{Regexp.union(*@group_by)})\":(\"[a-zA-Z0-9:\."-]*\"|\d*)/
  end
end

class AnalyzerSplitByJobClass < Base
  def group_payloads_attributes_by_job(job_payloads)
    batch_of_payloads = job_payloads.join(", ")
    # Join the scanned pairs into one string, then split it again on the
    # job_class lookahead so each chunk holds a single payload's attributes.
    batch_of_payloads.scan(pattern_to_extract_attributes).join(' ').split(/(?=job_class)/).each do |attributes|
      yield attributes.split.uniq.each_slice(2)
    end
  end

  def pattern_to_extract_attributes
    /\"(#{Regexp.union(*@group_by)})\":(\"[a-zA-Z0-9:\."-]*\"|\d*)/
  end
end

JOB_CLASSES = %w[
  Appscale::Jobs::AnalyzerTest::WebhookQueueJob
  Appscale::Jobs::AnalyzerTest::Whatever
  Appscale::Jobs::AnalyzerTest::ILikeThisOne
  Appscale::Jobs::AnalyzerTest::ImBackBaby
]

def generate_job_payload(job_class)
  "{\"class\":\"#{job_class}\",\"args\":[{\"job_class\":\"#{job_class}\",\"job_id\":\"657a094f-d9ee-4f0a-92ea-bf8314482390\",\"provider_job_id\":null,\"queue_name\":\"webhook\",\"priority\":null,\"arguments\":[{\"shop_id\":690933842,\"_aj_symbol_keys\":[\"shop_id\"]}],\"executions\":0,\"locale\":\"en\",\"log_level\":0,\"attempt\":0,\"request_id\":null,\"queue_start\":1540041993.7910662,\"expected_run_time\":1540041993.791,\"pod_id\":0,\"privacy_level\":null,\"feature_set\":null,\"shop_id\":690933842,\"queued_by_shopify_version\":\"0aec3a435b6de9f4da41bc511e2257727f4cf6ef\",\"queued_by_section\":\"NilSectionGlobals\",\"queued_with_readonly_master\":false}]}"
end

def generate_job_payloads(number_of_job_payloads)
  [].tap do |array|
    number_of_job_payloads.times do
      array << generate_job_payload(JOB_CLASSES.sample)
    end
  end
end

batches = [
  generate_job_payloads(number_of_job_payloads),
  generate_job_payloads(number_of_job_payloads),
]

global_scan = AnalyzerGlobalScan.new
global_scan_custom_iteration = AnalyzerGlobalScanCustomIteration.new
global_scan_custom_iteration_basic_regex = AnalyzerGlobalScanCustomIterationBasicRegex.new
split_by_job_class = AnalyzerSplitByJobClass.new
each_scan = AnalyzerEachScan.new

Benchmark.bmbm(28) do |x|
  x.report('global_scan:') { global_scan.process_payloads(batches) }
  x.report('global_scan_custom_iteration:') { global_scan_custom_iteration.process_payloads(batches) }
  x.report('global_scan_custom_iteration_basic_regex:') { global_scan_custom_iteration_basic_regex.process_payloads(batches) }
  x.report('split_by_job_class:') { split_by_job_class.process_payloads(batches) }
  x.report('each_scan:') { each_scan.process_payloads(batches) }
end
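
For reference, a minimal sketch of what the analyzer accumulates ("MyJob" is just a stand-in class name, and I'm reaching into the otherwise internal @counts_per_attribute, which the benchmark itself never reads back):

analyzer = AnalyzerGlobalScanCustomIterationBasicRegex.new
analyzer.process_payloads([[generate_job_payload("MyJob")]])

# The custom iteration drops the duplicate shop_id pair, so the single
# payload contributes one count per attribute:
analyzer.instance_variable_get(:@counts_per_attribute)
# => {"job_class"=>{"MyJob"=>1}, "shop_id"=>{"690933842"=>1}}
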
I have done more experiments, and it looks like we can improve a little by using the basic regex /\"(#{Regexp.union(*@group_by)})\":(\"[a-zA-Z0-9:\."-]*\"|\d*)/ (the one that was giving us some issues with multiple shop_id attributes) while doing the aggregation with the custom iteration.

global_scan_custom_iteration_basic_regex is the winner 🎉 🎉

Rehearsal -----------------------------------------------------------------------------
global_scan: 0.166552 0.007628 0.174180 ( 0.174279)
global_scan_custom_iteration: 0.168898 0.002649 0.171547 ( 0.171596)
global_scan_custom_iteration_basic_regex: 0.142025 0.001113 0.143138 ( 0.143178)
split_by_job_class: 0.188039 0.003317 0.191356 ( 0.191373)
each_scan: 1.065555 0.014591 1.080146 ( 1.080628)
-------------------------------------------------------------------- total: 1.760367sec
user system total real
global_scan: 0.159464 0.001663 0.161127 ( 0.161351)
global_scan_custom_iteration: 0.159547 0.001340 0.160887 ( 0.160950)
global_scan_custom_iteration_basic_regex: 0.140358 0.000836 0.141194 ( 0.141362)
split_by_job_class: 0.185607 0.002132 0.187739 ( 0.187775)
each_scan: 1.059577 0.009298 1.068875 ( 1.069279)
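
To see why the basic regex needed the custom iteration in the first place, here is a quick sketch (the regex is trimmed to two attributes for brevity; the payload comes from generate_job_payload above):

payload = generate_job_payload("Appscale::Jobs::AnalyzerTest::Whatever")
basic_regex = /\"(job_class|shop_id)\":(\"[a-zA-Z0-9:\."-]*\"|\d*)/

payload.scan(basic_regex)
# => [["job_class", "\"Appscale::Jobs::AnalyzerTest::Whatever\""],
#     ["shop_id", "690933842"],
#     ["shop_id", "690933842"]]

shop_id shows up twice (nested under "arguments" and again at the top level), so without the `next if attributes.last.include?(attribute_key_value)` dedup it would be counted twice per payload.
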
After talking with Moe: he pointed out that some method names are not very descriptive, so I cleaned them up. Also, you can now invoke the benchmark passing the number of job payloads that you want to process:

ruby benchmark.rb 10000

I have created three variations:

- AnalyzerGlobalScan does a global scan and then groups the results back into job payloads using multiple each_slice calls (see the sketch below).
- AnalyzerEachScan iterates over each job payload and performs the scan on each one of them. The code might be cleaner, since we don't need to build the batch of payloads, but the performance is worse due to the multiple scan calls.
- AnalyzerGlobalScanCustomIteration does a global scan, but instead of doing multiple each_slice calls it groups using custom logic, making the code read much better than AnalyzerGlobalScan.

Running the benchmark proves that each_scan is the slowest, but global_scan_custom_iteration is almost as fast as global_scan, and the code is much more comfortable to read.
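
For context, here is roughly what the each_slice grouping in AnalyzerGlobalScan is doing. This is a sketch with made-up values, and it assumes every payload yields the same number of attribute pairs, which is exactly what the size division in that class relies on:

# After scan + flatten + compact we have one flat list of keys and values:
flat = ["job_class", "\"A\"", "shop_id", "1", "job_class", "\"B\"", "shop_id", "2"]

# A first each_slice(2) rebuilds the (attribute_key, attribute_value) pairs:
pairs = flat.each_slice(2).to_a
# => [["job_class", "\"A\""], ["shop_id", "1"], ["job_class", "\"B\""], ["shop_id", "2"]]

# 4 pairs / 2 payloads = 2 pairs per payload, so a second each_slice
# regroups them per job:
pairs.each_slice(pairs.size / 2).to_a
# => [[["job_class", "\"A\""], ["shop_id", "1"]],
#     [["job_class", "\"B\""], ["shop_id", "2"]]]

The custom iteration variants replace that arithmetic with the job_class marker, which is why they read better.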