Last active
August 29, 2015 14:16
-
-
Save billdueber/654cbbb30f40b1c4a50b to your computer and use it in GitHub Desktop.
Unrolling traject ToFieldStep loop
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Libraries and macro modules used by the field definitions below.
# Each `extend` mixes a macro module into the object evaluating this file,
# right after the require that presumably defines it.
require 'library_stdnums'
require 'traject/macros/marc21_semantics'
extend Traject::Macros::Marc21Semantics
require 'traject/macros/marc_format_classifier'
extend Traject::Macros::MarcFormats
require 'traject/solr_json_writer'
require 'traject/marc_reader'
# Subfield-code strings interpolated into the extract_marc specs below:
# every letter a-z, and the shorter run a-u. Frozen so the shared constants
# cannot be mutated by accident.
ATOZ = ('a'..'z').to_a.join('').freeze
ATOU = ('a'..'u').to_a.join('').freeze
# Identifier, serialized record, catch-all text, language, format, ISBN and
# material type.
to_field "id", extract_marc("001", :first => true)
to_field 'marc_display', serialized_marc(:format => :xml)

# The block runs after the extractor and collapses everything it collected
# into one space-joined string.
to_field "text", extract_all_marc_values do |_rec, acc|
  acc.replace [acc.join(' ')] # turn it into a single string
end

to_field "language_facet", marc_languages("008[35-37]:041a:041d:")
to_field 'format', marc_formats

# Index every form StdNum::ISBN.allNormalizedValues returns for each 020$a,
# followed by the values exactly as extracted, with duplicates removed.
to_field "isbn_t", extract_marc('020a', :separator => nil) do |_rec, acc|
  orig = acc.dup
  acc.map! { |x| StdNum::ISBN.allNormalizedValues(x) }
  acc << orig
  acc.flatten! # map! and << left nested arrays behind
  acc.uniq!
end

to_field 'material_type_display', extract_marc('300a', :trim_punctuation => true)
# Title fields

# primary title
to_field 'title_a', extract_marc('245a')
to_field 'title_display', extract_marc('245a', :trim_punctuation => true, :alternate_script => false)
to_field 'title_vern_display', extract_marc('245a', :trim_punctuation => true, :alternate_script => :only)

# subtitle
to_field 'subtitle_t', extract_marc('245b')
to_field 'subtitle_display', extract_marc('245b', :trim_punctuation => true, :alternate_script => false)
to_field 'subtitle_vern_display', extract_marc('245b', :trim_punctuation => true, :alternate_script => :only)

# additional title fields
# %W builds an array of specs and allows the #{ATOZ} interpolation.
to_field 'title_addl_t',
  extract_marc(%W{
    245abnps
    130#{ATOZ}
    240abcdefgklmnopqrs
    210ab
    222ab
    242abnp
    243abcdefgklmnopqrs
    246abcdefgnp
    247abcdefgnp
  })

to_field 'title_added_entry_t', extract_marc(%W{
  700gklmnoprst
  710fgklmnopqrst
  711fgklnpst
  730abcdefgklmnopqrst
  740anp
})

to_field 'title_series_t', extract_marc("440anpv:490av")
to_field 'title_sort', marc_sortable_title
# Author fields
to_field 'author_t', extract_marc("100abcegqu:110abcdegnu:111acdegjnqu")
to_field 'author_addl_t', extract_marc("700abcegqu:710abcdegnu:711acdegjnqu")
to_field 'author_display', extract_marc("100abcdq:110#{ATOZ}:111#{ATOZ}", :alternate_script => false)
to_field 'author_vern_display', extract_marc("100abcdq:110#{ATOZ}:111#{ATOZ}", :alternate_script => :only)

# JSTOR isn't an author. Try to not use it as one
# NOTE(review): nothing here filters JSTOR out; the sort key below comes
# straight from marc_sortable_author.
to_field 'author_sort', marc_sortable_author
# Subject fields
# Main subject text: subfields a-u of 600/610/611/630, plus selected
# subfields of 650-655. The last array element is itself a colon-joined spec.
to_field 'subject_t', extract_marc(%W(
  600#{ATOU}
  610#{ATOU}
  611#{ATOU}
  630#{ATOU}
  650abcde
  651ae
  653a:654abcde:655abc
))

# Subdivision subfields (v-z) of the same tags.
to_field 'subject_addl_t', extract_marc("600vwxyz:610vwxyz:611vwxyz:630vwxyz:650vwxyz:651vwxyz:654vwxyz:655vwxyz")

# Facets
to_field 'subject_topic_facet', extract_marc("600abcdq:610ab:611ab:630aa:650aa:653aa:654ab:655ab", :trim_punctuation => true)
to_field 'subject_era_facet', extract_marc("650y:651y:654y:655y", :trim_punctuation => true)
to_field 'subject_geo_facet', extract_marc("651a:650z", :trim_punctuation => true)
# Publication fields
# 260$a twice: once without alternate-script values, once with only those.
to_field 'published_display', extract_marc('260a', :trim_punctuation => true, :alternate_script => false)
to_field 'published_vern_display', extract_marc('260a', :trim_punctuation => true, :alternate_script => :only)
to_field 'pub_date', marc_publication_date
# Call Number fields
to_field 'lc_callnum_display', extract_marc('050ab', :first => true)

# NOTE(review): the original comment says the first letter is sent to the
# translation map, but :translation_map is an option of the extractor, and
# the extractor lambda runs before this block. The block therefore appears
# to trim the already-translated value to one character. Confirm the
# intended order before relying on this facet.
to_field 'lc_1letter_facet', extract_marc('050ab', :first => true, :translation_map => 'callnumber_map') do |_rec, acc|
  # Just get the first letter to send to the translation map
  acc.map! { |x| x[0] }
end

# One to three capital letters followed by a digit; capture 1 is the letters.
alpha_pat = /\A([A-Z]{1,3})\d.*\Z/
to_field 'lc_alpha_facet', extract_marc('050a', :first => true) do |_rec, acc|
  # Keep only the leading letters; values that do not match become nil.
  acc.map! do |x|
    (m = alpha_pat.match(x)) ? m[1] : nil
  end
  acc.compact! # eliminate nils
end

to_field 'lc_b4cutter_facet', extract_marc('050a', :first => true)
# URL Fields
# Matches link notes (856 $z / $3 text) that describe something other than
# the full text. The url_* fields below test the joined note text against it.
# There must be no trailing `|` inside the pattern: an empty alternative
# matches every string, which would make the test meaningless.
notfulltext = /abstract|description|sample text|table of contents/i
# Full-text links taken from 856 fields, decided by the second indicator:
#   '0'       -> every $u subfield value is added
#   '2'       -> the field is skipped
#   otherwise -> f['u'] is added unless the joined $z/$3 text matches
#                notfulltext
to_field('url_fulltext_display') do |rec, acc|
  rec.fields('856').each do |f|
    case f.indicator2
    when '0'
      f.each { |sf| acc << sf.value if sf.code == 'u' }
    when '2'
      # do nothing
    else
      z3 = [f['z'], f['3']].join(' ') # missing subfields join as empty text
      unless notfulltext.match(z3)
        url = f['u']
        acc << url unless url.nil?
      end
    end
  end
end
# Very similar to url_fulltext_display. Should DRY up.
# Supplementary links taken from 856 fields, decided by the second indicator:
#   '2'       -> every $u subfield value is added
#   '0'       -> the field is skipped
#   otherwise -> f['u'] is added only if the joined $z/$3 text matches
#                notfulltext
to_field 'url_suppl_display' do |rec, acc|
  rec.fields('856').each do |f|
    case f.indicator2
    when '2'
      f.each { |sf| acc << sf.value if sf.code == 'u' }
    when '0'
      # do nothing
    else
      z3 = [f['z'], f['3']].join(' ') # missing subfields join as empty text
      if notfulltext.match(z3)
        url = f['u']
        acc << url unless url.nil?
      end
    end
  end
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'benchmark'
require 'benchmark/ips'
require 'marc'
require 'json'

# Put ../lib and the current directory at the front of the load path before
# requiring traject (presumably so a local checkout is picked up first).
$LOAD_PATH.unshift '../lib'
$LOAD_PATH.unshift '.'
require 'traject'
# Get some marc records to work with
# 100.json is read one line at a time; each line is parsed as JSON and
# turned into a MARC::Record. File.foreach closes the file once iteration
# finishes, so no handle is left open.
records = File.foreach('100.json').map do |line|
  MARC::Record.new_from_hash(JSON.parse(line))
end
# ToFieldStep variant under test: the lambda and block handling are written
# out one after the other instead of being iterated over, which is the
# "unrolled" form the benchmark compares against the stock implementation.
# Do not fold the two branches back into a loop.
class Traject::Indexer::ToFieldStepNew < Traject::Indexer::ToFieldStep
  # Runs this step against one record.
  #
  # Reads @lambda and @block (presumably set by the superclass initializer).
  # Each one that is present is called in that order with the source record
  # and a shared accumulator; a callable whose arity is not exactly 2 also
  # receives the context as a third argument.
  #
  # @param context [Object] must respond to #source_record
  # @return [Array] the accumulator after both callables have run
  def execute(context)
    accumulator = []

    if @lambda
      if @lambda.arity == 2
        @lambda.call(context.source_record, accumulator)
      else
        @lambda.call(context.source_record, accumulator, context)
      end
    end

    if @block
      if @block.arity == 2
        @block.call(context.source_record, accumulator)
      else
        @block.call(context.source_record, accumulator, context)
      end
    end

    accumulator
  end
end
# Indexer whose to_field registers ToFieldStepNew steps, so the same rules
# file can be loaded into both implementations for comparison.
class Traject::IndexerNew < Traject::Indexer
  # Appends a ToFieldStepNew to @index_steps.
  #
  # @param field_name [String] output field name
  # @param aLambda [#call, nil] optional callable, run before the block
  # @param block [Proc, nil] optional block, run after aLambda
  def to_field(field_name, aLambda = nil, &block)
    # caller.first is the line that invoked to_field; it is passed to the
    # step as its source location.
    source_location = Traject::Util.extract_caller_location(caller.first)
    @index_steps << ToFieldStepNew.new(field_name, aLambda, block, source_location)
  end
end
# Load the same rules into one indexer of each kind. The rules file is read
# once with File.read, which closes the handle, instead of being opened
# twice and never closed.
index_rules = File.read('test_index.rb')

old_indexer = Traject::Indexer.new({})
old_indexer.instance_eval(index_rules)

new_indexer = Traject::IndexerNew.new({})
new_indexer.instance_eval(index_rules)
# Map every loaded record through each indexer and compare iteration rates.
Benchmark.ips do |x|
  # time/warmup are presumably seconds, per benchmark-ips; confirm there.
  x.config(:time => 25, :warmup => 5)

  x.report("Current impl") do
    records.each { |r| old_indexer.map_record(r) }
  end

  x.report("Unroll") do
    records.each { |r| new_indexer.map_record(r) }
  end

  x.compare!
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment