Last active
July 27, 2017 19:24
-
-
Save billdueber/027b6f9c35623889bb2d42c091c6efa3 to your computer and use it in GitHub Desktop.
A self-contained (read: monkeypatch) benchmarking program for SolrEad based on https://github.com/awead/solr_ead/pull/20
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'benchmark' | |
require 'uri' | |
require 'solr_ead' | |
require 'concurrent' | |
# Make a subclass with all the speed patches
# (see https://github.com/awead/solr_ead/pull/20).
#
# Only the methods below are overridden; helpers such as parent_id_list,
# component_children? and ead_to_html are not defined in this class and are
# presumably provided by SolrEad::Indexer or its mixins.
class IndexerWithPatches < SolrEad::Indexer
  # Builds the additional Solr fields for a single component node.
  #
  # @param node the component node being indexed (must respond to #attr,
  #   #parent and #xpath)
  # @param addl_fields [Hash] hash to populate; a fresh one is used by default
  # @return [Hash] addl_fields, with the component fields added
  def additional_component_fields(node, addl_fields = {})
    # Clear or create the cache. It is reset on every call, so the cached
    # document-level lookups are shared only within one component.
    @cache = {}
    p_ids = parent_id_list(node)
    p_unittitles = parent_unittitle_list(node)
    parent_id = node.parent.attr("id")

    addl_fields["id"] = [eadid(node), node.attr("id")].join
    addl_fields[Solrizer.solr_name("ead", :stored_sortable)] = eadid(node)
    addl_fields[Solrizer.solr_name("parent", :stored_sortable)] = parent_id unless parent_id.nil?
    addl_fields[Solrizer.solr_name("parent", :displayable)] = p_ids
    addl_fields[Solrizer.solr_name("parent_unittitles", :displayable)] = p_unittitles
    addl_fields[Solrizer.solr_name("parent_unittitles", :searchable)] = p_unittitles
    # A component's level is one deeper than its number of ancestors.
    addl_fields[Solrizer.solr_name("component_level", :type => :integer)] = p_ids.length + 1
    addl_fields[Solrizer.solr_name("component_children", :type => :boolean)] = component_children?(node)
    addl_fields[Solrizer.solr_name("collection", :facetable)] = collection(node)
    addl_fields[Solrizer.solr_name("collection", :displayable)] = collection(node)
    addl_fields[Solrizer.solr_name("repository", :facetable)] = repository(node)
    addl_fields[Solrizer.solr_name("repository", :displayable)] = repository(node)
    addl_fields
  end

  # The three lookups below use absolute XPaths (they start at /ead), and each
  # result is memoized in @cache. `@cache ||= {}` keeps them usable when they
  # are called directly, before additional_component_fields has created it.

  # @return [String] stripped text of /ead/archdesc/did/repository
  def repository(node)
    (@cache ||= {})[:repo] ||= node.xpath("/ead/archdesc/did/repository").text.strip
  end

  # @return [String] text of /ead/archdesc/did/unittitle
  def collection(node)
    (@cache ||= {})[:collection] ||= node.xpath("/ead/archdesc/did/unittitle").text
  end

  # @return [String] text of /ead/eadheader/eadid
  def eadid(node)
    (@cache ||= {})[:eadid] ||= node.xpath("/ead/eadheader/eadid").text
  end

  # Collects the titles of every enclosing <c> element, outermost first.
  #
  # @param node the component node to start from
  # @param results [Array] accumulator that titles are appended to
  # @return [Array<String>] ancestor titles ordered from root to direct parent
  def parent_unittitle_list(node, results = ::Array.new)
    while node.parent.name == "c"
      parent = node.parent
      results << get_title(parent)
      node = parent
    end
    results.reverse
  end

  # Memoized wrapper around _get_title.
  #
  # The previous version stored entries under node.object_id but looked them
  # up with the node itself, so the memo never hit. Keys are now the node
  # objects, compared by identity. Like the old object_id keying, this assumes
  # the XML library returns the same Ruby object for a given node.
  # The memo is dropped whenever a node from a different document arrives, so
  # it cannot grow without bound or return titles from an earlier file.
  #
  # @param node the component node whose title is wanted
  # @return [String] the display title for node
  def get_title(node)
    doc = node.document
    unless @memtitle && @memtitle_doc.equal?(doc)
      @memtitle = {}.compare_by_identity
      @memtitle_doc = doc
    end
    @memtitle.fetch(node) { @memtitle[node] = _get_title(node) }
  end

  # Computes the display title for a component: its unittitle if non-empty,
  # otherwise its unitdate if non-empty, otherwise a placeholder string.
  #
  # @param node the component node
  # @return [String]
  def _get_title(node)
    title = node.at_xpath("./did/unittitle")
    date = node.at_xpath("./did/unitdate")
    if title && !title.content.empty?
      ead_to_html(title.content)
    elsif date && !date.content.empty?
      ead_to_html(date.content)
    else
      "[No title available]"
    end
  end
end
# Create a decent mimic of the processes needed to
# create a solr document and the component documents.
#
# The file's components are fetched exactly once and then each one is run
# through additional_component_fields. The original fetched the components
# twice and discarded the first result, doubling that work inside the timed
# section.
#
# @param indexer an object responding to #components(filename) and
#   #additional_component_fields(component)
# @param filename [String] path of the EAD XML file
# @return the component collection returned by indexer.components
def fake_solr_doc(indexer, filename)
  components = indexer.components(filename)
  # STDERR.puts "Working on #{filename} with #{components.size} components"
  components.each { |component| indexer.additional_component_fields(component) }
end
# Nothing here ever talks to Solr, but a URL still has to be set.
ENV["SOLR_URL"] = "http://not.gonna.happen/solr"

# Every command-line argument is taken to be an EAD XML file to benchmark.
filenames = ARGV
if filenames.length.zero?
  puts "Need to pass along at least one filename of an EAD XML file to benchmark"
  exit(1)
end

# One stock indexer and one patched indexer, built in that order.
stock_indexer, patched_indexer = SolrEad::Indexer.new, IndexerWithPatches.new
# Reports a file's size in kilobytes (bytes / 1024) as a Float.
#
# @param fname [String] path of the file to measure
# @return [Float] size in KB
def size_in_k(fname)
  bytes = File.size(fname)
  bytes.fdiv(1024)
end
# Running totals (elapsed seconds, KB processed, component count), kept
# separately for the stock and the patched indexer.
stock_total = stock_size = stock_comp = 0
patched_total = patched_size = patched_comp = 0

# Seconds to wait for a single benchmark run before giving up on it.
TIMEOUT = 240

# Flush each row as soon as it is printed.
$stdout.sync = true

# Banner: interpreter description plus the notes on how the run works.
puts [
  RUBY_DESCRIPTION,
  "NOTE: We give up after #{TIMEOUT} seconds",
  "Skipping everything where stock had a sub-second processing time"
]

# Row layout for the per-file results; the header uses the same column widths.
FORMAT = "%-25s %10.2f %7d %7s %7s %7s"
puts format("%-25s %10s %7s %7s %7s %7s", "EAD File", "Size (KB)", "Comps", "Stock", "Patched", "Speedup")
puts "-" * 68
# Benchmark every file with the stock indexer, then with the patched one,
# print one result row per file and accumulate the totals.
filenames.each do |fn|
  name = File.basename(fn)
  size = size_in_k(fn)
  comp = stock_indexer.components(fn).size

  # Each run happens inside a promise so it can be abandoned after TIMEOUT
  # seconds. A nil value means no timing was obtained, either because the
  # wait timed out or because the run failed.
  # The two runs are deliberately sequential: stock is waited on before the
  # patched run starts.
  stock_p = Concurrent::Promise.execute { Benchmark.realtime { fake_solr_doc(stock_indexer, fn) } }
  stock_v = stock_p.value(TIMEOUT)

  patched_p = Concurrent::Promise.execute { Benchmark.realtime { fake_solr_doc(patched_indexer, fn) } }
  patched_v = patched_p.value(TIMEOUT)

  # NOTE(review): the banner printed earlier says rows are skipped when
  # *stock* is sub-second; this condition requires both runs to be
  # sub-second. Confirm which one is intended.
  next if stock_v && patched_v && stock_v < 1 && patched_v < 1

  stock = stock_v ? "%7.2f" % stock_v : "***"
  patched = patched_v ? "%7.2f" % patched_v : " - "

  # The speedup needs both timings. The original tested the promise object
  # (patched_p, always truthy) and so divided by nil when the patched run
  # produced no value.
  mul = if stock_v && patched_v
          "%3.0fx" % (stock_v / patched_v)
        else
          " - "
        end

  puts FORMAT % [name, size, comp, stock, patched, mul]

  if stock_v
    stock_total += stock_v
    stock_size += size
    stock_comp += comp
  end

  if patched_v
    patched_total += patched_v
    patched_size += size
    patched_comp += comp
  end
end
# Summary rows: accumulated size, component count and elapsed time for each
# indexer, with "--" in the column that belongs to the other indexer.
puts
puts format("%-25s %10.2f %7d %7.2f %7s", "Stock Totals", stock_size, stock_comp, stock_total, "--")
puts format("%-25s %10.2f %7d %7s %7.2f", "Patched Totals", patched_size, patched_comp, "--", patched_total)
puts
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment