Last active
October 22, 2018 13:32
-
-
Save GustavoCaso/524861c48e8b2f7b9b8ec3fffba24190 to your computer and use it in GitHub Desktop.
Benchmark for Analyzer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Inline Bundler keeps this benchmark a single self-contained script.
%w[bundler/inline benchmark].each { |lib| require lib }

gemfile do
  source 'https://rubygems.org'
  gem 'pry'
end
number_of_job_payload = ARGV[0].to_i || 100_000 | |
# Shared skeleton for the payload analyzers benchmarked below.
# Subclasses implement #group_payloads_attributes_by_job to decide how a batch
# of raw JSON payload strings is turned into per-job [key, value] pairs.
class Base
  def initialize(where_conditions: {}, group_by: %w[job_class shop_id api_client_id target_hostname])
    @where_conditions = where_conditions
    @group_by = group_by
    # Nested counter: attribute name -> sanitized attribute value -> count.
    @counts_per_attribute = Hash.new { |hash, key| hash[key] = Hash.new(0) }
  end

  # Runs #process over every batch in turn.
  def process_payloads(batches)
    batches.each { |job_payloads| process(job_payloads) }
  end

  # Tallies attribute occurrences for one batch. As soon as an attribute fails
  # a where-condition, `break` abandons the remaining attributes of that one
  # payload (the outer iteration over payloads continues).
  def process(job_payloads)
    group_payloads_attributes_by_job(job_payloads) do |job_payload_attributes|
      job_payload_attributes.each do |key, raw_value|
        value = raw_value.delete('"')
        condition = @where_conditions[key]
        break if condition && !condition.include?(value)
        @counts_per_attribute[key][value] += 1
      end
    end
  end

  # Hook for subclasses: must yield one array of [key, value] pairs per job.
  def group_payloads_attributes_by_job(job_payloads)
    raise NotImplementedError
  end

  # Regex used to pull the grouped attributes out of raw payload strings.
  # shop_id gets a dedicated pattern because it also appears nested inside the
  # job arguments hash.
  def pattern_to_extract_attributes
    return attributes_regex(@group_by) unless @group_by.include?("shop_id")

    remaining = @group_by.clone
    remaining.delete("shop_id")
    Regexp.union(shop_id_regex, attributes_regex(remaining))
  end

  def shop_id_regex
    /arguments":\[{\"(shop_id)\":(\"[a-zA-Z0-9:\."-]*\"|\d*)/
  end

  def attributes_regex(attributes)
    /\"(#{Regexp.union(*attributes)})\":(\"[a-zA-Z0-9:\."-]*\"|\d*)/
  end
end
# Joins the whole batch into one string, scans it once, then slices the flat
# match list back into equally sized per-job groups.
class AnalyzerGlobalScan < Base
  def group_payloads_attributes_by_job(job_payloads)
    joined = job_payloads.join(", ")
    pairs = joined.scan(pattern_to_extract_attributes).flatten.compact.each_slice(2).to_a
    # NOTE(review): assumes every payload yields the same number of matches;
    # the integer division would mis-slice otherwise — confirm for real data.
    per_job = pairs.size / job_payloads.size
    pairs.each_slice(per_job) { |job_payload| yield job_payload }
  end
end
# Scans each payload string individually — simplest strategy, one regex pass
# per payload.
class AnalyzerEachScan < Base
  def group_payloads_attributes_by_job(payloads)
    payloads.each do |payload|
      # flatten + compact drops the nil captures from the union regex, then
      # the flat list is rebuilt as [attribute_key, attribute_value] pairs.
      yield payload.scan(pattern_to_extract_attributes).flatten.compact.each_slice(2).to_a
    end
  end
end
# Single global scan, then a hand-rolled pass that partitions the matches back
# into jobs, using job_class as the per-job boundary marker.
class AnalyzerGlobalScanCustomIteration < Base
  def group_payloads_attributes_by_job(job_payloads)
    joined = job_payloads.join(", ")
    grouped = joined.scan(pattern_to_extract_attributes).each_with_object([]) do |match, jobs|
      pair = match.compact
      # Every payload starts with a job_class match, so it marks a new job.
      jobs << [] if pair.include?('job_class')
      jobs.last << pair
    end
    grouped.each { |job_payload| yield job_payload }
  end
end
# Like AnalyzerGlobalScanCustomIteration, but uses the basic attribute regex
# (no special shop_id pattern) and de-duplicates repeated pairs per job — the
# basic regex matches shop_id twice per payload.
class AnalyzerGlobalScanCustomIterationBasicRegex < Base
  def group_payloads_attributes_by_job(job_payloads)
    batch_of_payloads = job_payloads.join(", ")
    batch_of_payloads.scan(pattern_to_extract_attributes).each_with_object([]) do |attribute_key_value, attributes|
      # We group by job_class because we know each job payload has a job_class.
      if attribute_key_value.include?('job_class')
        attributes << []
        attributes.last << attribute_key_value
      else
        # Skips the duplicate shop_id match produced by the basic regex.
        next if attributes.last.include?(attribute_key_value)
        attributes.last << attribute_key_value
      end
    end.each do |job_payload|
      yield job_payload
    end
  end

  # CONSISTENCY FIX: the original re-declared the exact regex literal already
  # provided by Base#attributes_regex; delegate to the shared helper instead.
  def pattern_to_extract_attributes
    attributes_regex(@group_by)
  end
end
# Global scan with the basic regex, then splits the joined match text on a
# job_class lookahead so each chunk corresponds to one job.
class AnalyzerSplitByJobClass < Base
  def group_payloads_attributes_by_job(job_payloads)
    batch_of_payloads = job_payloads.join(", ")
    # The lookahead split keeps "job_class" at the head of each chunk; uniq
    # drops the duplicate shop_id tokens before re-pairing.
    batch_of_payloads.scan(pattern_to_extract_attributes).join(' ').split(/(?=job_class)/).each do |attributes|
      yield attributes.split.uniq.each_slice(2)
    end
  end

  # CONSISTENCY FIX: the original re-declared the exact regex literal already
  # provided by Base#attributes_regex; delegate to the shared helper instead.
  def pattern_to_extract_attributes
    attributes_regex(@group_by)
  end
end
# Job classes sampled when generating fake payloads for the benchmark.
# Frozen: mutable constants are a Ruby anti-pattern (and freezing lets the VM
# share the array safely).
JOB_CLASSES = %w[
  Appscale::Jobs::AnalyzerTest::WebhookQueueJob
  Appscale::Jobs::AnalyzerTest::Whatever
  Appscale::Jobs::AnalyzerTest::ILikeThisOne
  Appscale::Jobs::AnalyzerTest::ImBackBaby
].freeze
# Builds one fake ActiveJob-style JSON payload string for the given job_class.
# %() delimiters avoid the wall of \" escapes; the resulting string is
# byte-identical to the original double-quoted literal.
def generate_job_payload(job_class)
  %({"class":"#{job_class}","args":[{"job_class":"#{job_class}","job_id":"657a094f-d9ee-4f0a-92ea-bf8314482390","provider_job_id":null,"queue_name":"webhook","priority":null,"arguments":[{"shop_id":690933842,"_aj_symbol_keys":["shop_id"]}],"executions":0,"locale":"en","log_level":0,"attempt":0,"request_id":null,"queue_start":1540041993.7910662,"expected_run_time":1540041993.791,"pod_id":0,"privacy_level":null,"feature_set":null,"shop_id":690933842,"queued_by_shopify_version":"0aec3a435b6de9f4da41bc511e2257727f4cf6ef","queued_by_section":"NilSectionGlobals","queued_with_readonly_master":false}]})
end
# Returns an array of `number_of_job_payloads` payload strings with randomly
# sampled job classes.
# NOTE(review): the "genarate" typo in the name is kept intentionally — the
# driver code calls the method under this exact name.
def genarate_job_payloads(number_of_job_payloads)
  Array.new(number_of_job_payloads) { generate_job_payload(JOB_CLASSES.sample) }
end
# Two identical-sized batches so every strategy chews through the same amount
# of work.
batches = Array.new(2) { genarate_job_payloads(number_of_job_payload) }

# Label => analyzer instance, in the order the reports should appear.
analyzers = {
  'global_scan:' => AnalyzerGlobalScan.new,
  'global_scan_custom_iteration:' => AnalyzerGlobalScanCustomIteration.new,
  'global_scan_custom_iteration_basic_regex:' => AnalyzerGlobalScanCustomIterationBasicRegex.new,
  'split_by_job_class:' => AnalyzerSplitByJobClass.new,
  'each_scan:' => AnalyzerEachScan.new,
}

# bmbm runs a rehearsal pass first to warm up and reduce GC skew.
Benchmark.bmbm(28) do |x|
  analyzers.each do |label, analyzer|
    x.report(label) { analyzer.process_payloads(batches) }
  end
end
After talking with Moe, he pointed out that some method names are not very descriptive.
Also, now you can invoke the benchmark by passing the number of job payloads that you want to execute.
ruby benchmark.rb 10000
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I have done more experiments and looks like we can improve a little by using the basic regex
/\"(#{Regexp.union(*@group_by)})\":(\"[a-zA-Z0-9:\."-]*\"|\d*)/
that was giving us some issues with multiple shop_id
attributes, but doing the aggregation with the custom iteration. global_scan_custom_iteration_basic_regex
is the winner 🎉 🎉