Skip to content

Instantly share code, notes, and snippets.

@kardeiz
Created September 10, 2012 15:44
Show Gist options
  • Save kardeiz/3691648 to your computer and use it in GitHub Desktop.
Save kardeiz/3691648 to your computer and use it in GitHub Desktop.
get source, etc. from DigiTool objects. Check against CSV and update matches
#!/usr/bin/env ruby
require 'nokogiri'
require 'csv'
# Extracts the DigiTool PID and the photo-number sources from one
# digital-entity XML file.
#
# The Dublin Core record is taken from the text of the `md` element whose
# `type` child is 'dc' and is parsed as a second XML document.
#
# Missing nodes are handled with explicit nil checks instead of inline
# `rescue nil`, so an absent element still yields nil while genuine errors
# (unreadable file, programming mistakes) are no longer silently swallowed.
#
# @param file [String] path to a digital-entity XML file
# @return [Hash] with two keys:
#   :pid    => text of the entity's pid element, or nil when it is absent
#   :source => Array of dc:source values that begin with "Photo Number: ",
#              with that prefix removed; nil when there is no dc value node
def get_stuff_from_dc(file)
  photo_prefix = /^Photo\sNumber:\s/
  my_xml = Nokogiri::XML(File.read(file))

  pid_node = my_xml.at_xpath('//xb:digital_entity/pid/text()', "xb" => "http://com/exlibris/digitool/repository/api/xmlbeans")
  dc_node = my_xml.at_xpath("//md[type='dc']/value")

  source = nil
  if dc_node
    # The dc record is stored as text inside the value node, so it has to be
    # parsed again before dc:source can be queried.
    dc_doc = Nokogiri::XML(dc_node.content)
    source_texts = dc_doc.xpath("//dc:source/text()", "dc" => "http://purl.org/dc/elements/1.1/").map(&:content)
    source = source_texts.select { |text| text =~ photo_prefix }
                         .map { |text| text.gsub(photo_prefix, "") }
  end

  { :pid => (pid_node && pid_node.content), :source => source }
end
# Load the photo spreadsheet into `f` as an array of rows. Headers are not
# enabled, so later code addresses fields by column index.
raw_csv = File.read('/home/jhbrown/shared_with_fedora/photodata.csv')
f = CSV.parse(raw_csv.encode('UTF-8', :invalid => :replace)) # , :headers => true, :header_converters => :symbol)
# Collect the absolute path of every entry in the digital_entities directory,
# then extract the pid/source hash from each one.
entity_dir = '/home/jhbrown/shared_with_fedora/digital_entities'
my_files = Dir.chdir(entity_dir) do
  Dir.glob("./*").map { |relative| File.expand_path(relative) }
end
# my_files = ['/home/jhbrown/shared_with_fedora/digital_entities/87688.xml']
my_files_parsed = my_files.map { |path| get_stuff_from_dc(path) }
# Index the parsed entities by photo number so each CSV row needs a single
# hash lookup instead of a scan over every entity. Entities with no pid or no
# source list contribute nothing; this replaces the inline `rescue nil` that
# previously hid the nil-source case along with any other error.
pids_by_photo_number = {}
my_files_parsed.each do |entity|
  pid = entity[:pid]
  sources = entity[:source]
  next if pid.nil? || sources.nil?

  # uniq: an entity listing the same photo number twice still counts once.
  sources.uniq.each do |photo_number|
    (pids_by_photo_number[photo_number] ||= []) << pid
  end
end

# Append one extra column to every row (the row is extended in place): the
# matching PIDs joined with "|", or nil when no entity references the row's
# photo number.
csv_arr_1 = f.map do |row|
  matching_pids = pids_by_photo_number[row[6]] # row[6] is the photo number; x[:photo_number] with headers
  row << (matching_pids && matching_pids.join("|"))
end
# Write the augmented rows to temp.csv in the current working directory.
CSV.open("temp.csv", "w") do |out|
  csv_arr_1.each { |row| out << row }
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment