Last active
August 29, 2015 14:14
-
-
Save billdueber/e4b6787fe2b745183a50 to your computer and use it in GitHub Desktop.
traject translation of blacklight solrmarc
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
gem 'traject', '2.0.pre' # from the dev-2.0 branch | |
$:.unshift '.' | |
require 'library_stdnums' | |
require 'traject/macros/marc21_semantics' | |
extend Traject::Macros::Marc21Semantics | |
require 'traject/macros/marc_format_classifier' | |
extend Traject::Macros::MarcFormats | |
require 'traject/solr_json_writer' | |
require 'traject/marc_reader' | |
settings do | |
provide "reader_class_name", "Traject::MarcReader" | |
provide "marc_source.type", "binary" | |
provide "solr.url", ENV["SOLR_URL"] | |
provide "solr_writer.commit_on_close", "true" | |
provide "solr_writer.thread_pool", 1 | |
provide "solr_writer.batch_size", 100 | |
provide "writer_class_name", "Traject::SolrJsonWriter" | |
provide 'processing_thread_pool', 1 | |
provide "log.batch_size", 10_000 | |
end | |
ATOZ = ('a'..'z').to_a.join('') | |
ATOU = ('a'..'u').to_a.join('') | |
to_field "id", extract_marc("001", :first => true) | |
to_field 'marc_display', serialized_marc(:format => :xml) | |
to_field "text", extract_all_marc_values do |r, acc| | |
acc.replace [acc.join(' ')] # turn it into a single string | |
end | |
to_field "language_facet", marc_languages("008[35-37]:041a:041d:") | |
to_field 'format', marc_formats | |
to_field "isbn_t", extract_marc('020a', :separator=>nil) do |rec, acc| | |
orig = acc.dup | |
acc.map!{|x| StdNum::ISBN.allNormalizedValues(x)} | |
acc << orig | |
acc.flatten! | |
acc.uniq! | |
end | |
to_field 'material_type_display', extract_marc('300a', :trim_punctuation => true) | |
# Title fields | |
# primary title | |
to_field 'title_a', extract_marc('245a') | |
to_field 'title_display', extract_marc('245a', :trim_punctuation => true, :alternate_script=>false) | |
to_field 'title_vern_display', extract_marc('245a', :trim_punctuation => true, :alternate_script=>:only) | |
# subtitle | |
to_field 'subtitle_t', extract_marc('245b') | |
to_field 'subtitle_display', extract_marc('245b', :trim_punctuation => true, :alternate_script=>false) | |
to_field 'subtitle_vern_display', extract_marc('245b', :trim_punctuation => true, :alternate_script=>:only) | |
# additional title fields | |
to_field 'title_addl_t', | |
extract_marc(%W{ | |
245abnps | |
130#{ATOZ} | |
240abcdefgklmnopqrs | |
210ab | |
222ab | |
242abnp | |
243abcdefgklmnopqrs | |
246abcdefgnp | |
247abcdefgnp | |
}) | |
to_field 'title_added_entry_t', extract_marc(%W{ | |
700gklmnoprst | |
710fgklmnopqrst | |
711fgklnpst | |
730abcdefgklmnopqrst | |
740anp | |
}) | |
to_field 'title_series_t', extract_marc("440anpv:490av") | |
to_field 'title_sort', marc_sortable_title | |
# Author fields | |
to_field 'author_t', extract_marc("100abcegqu:110abcdegnu:111acdegjnqu") | |
to_field 'author_addl_t', extract_marc("700abcegqu:710abcdegnu:711acdegjnqu") | |
to_field 'author_display', extract_marc("100abcdq:110#{ATOZ}:111#{ATOZ}", :alternate_script=>false) | |
to_field 'author_vern_display', extract_marc("100abcdq:110#{ATOZ}:111#{ATOZ}", :alternate_script=>:only) | |
# JSTOR isn't an author. Try to not use it as one | |
to_field 'author_sort', marc_sortable_author | |
# Subject fields | |
to_field 'subject_t', extract_marc(%W( | |
600#{ATOU} | |
610#{ATOU} | |
611#{ATOU} | |
630#{ATOU} | |
650abcde | |
651ae | |
653a:654abcde:655abc | |
)) | |
to_field 'subject_addl_t', extract_marc("600vwxyz:610vwxyz:611vwxyz:630vwxyz:650vwxyz:651vwxyz:654vwxyz:655vwxyz") | |
to_field 'subject_topic_facet', extract_marc("600abcdq:610ab:611ab:630aa:650aa:653aa:654ab:655ab", :trim_punctuation => true) | |
to_field 'subject_era_facet', extract_marc("650y:651y:654y:655y", :trim_punctuation => true) | |
to_field 'subject_geo_facet', extract_marc("651a:650z",:trim_punctuation => true ) | |
# Publication fields | |
to_field 'published_display', extract_marc('260a', :trim_punctuation => true, :alternate_script=>false) | |
to_field 'published_vern_display', extract_marc('260a', :trim_punctuation => true, :alternate_script=>:only) | |
to_field 'pub_date', marc_publication_date | |
# Call Number fields | |
to_field 'lc_callnum_display', extract_marc('050ab', :first => true) | |
to_field 'lc_1letter_facet', extract_marc('050ab', :first=>true, :translation_map=>'callnumber_map') do |rec, acc| | |
# Just get the first letter to send to the translation map | |
acc.map!{|x| x[0]} | |
end | |
alpha_pat = /\A([A-Z]{1,3})\d.*\Z/ | |
to_field 'lc_alpha_facet', extract_marc('050a', :first=>true) do |rec, acc| | |
acc.map! do |x| | |
(m = alpha_pat.match(x)) ? m[1] : nil | |
end | |
acc.compact! # eliminate nils | |
end | |
to_field 'lc_b4cutter_facet', extract_marc('050a', :first=>true) | |
# URL Fields | |
notfulltext = /abstract|description|sample text|table of contents|/i | |
to_field('url_fulltext_display') do |rec, acc| | |
rec.fields('856').each do |f| | |
case f.indicator2 | |
when '0' | |
f.find_all{|sf| sf.code == 'u'}.each do |url| | |
acc << url.value | |
end | |
when '2' | |
# do nothing | |
else | |
z3 = [f['z'], f['3']].join(' ') | |
unless notfulltext.match(z3) | |
acc << f['u'] unless f['u'].nil? | |
end | |
end | |
end | |
end | |
# Very similar to url_fulltext_display. Should DRY up. | |
to_field 'url_suppl_display' do |rec, acc| | |
rec.fields('856').each do |f| | |
case f.indicator2 | |
when '2' | |
f.find_all{|sf| sf.code == 'u'}.each do |url| | |
acc << url.value | |
end | |
when '0' | |
# do nothing | |
else | |
z3 = [f['z'], f['3']].join(' ') | |
if notfulltext.match(z3) | |
acc << f['u'] unless f['u'].nil? | |
end | |
end | |
end | |
end | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment