Created
April 30, 2011 01:08
-
-
Save rhulse/949309 to your computer and use it in GitHub Desktop.
Recipe importer for ELF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This code is for illustrative purposes only and should be read in conjunction
# with this blog post:
# http://richardhulse.blogspot.com/2011/04/rebuilding-radio-nz-part-4-content.html
# I was still learning Rails and Ruby at the time this was written,
# so it is a bit rough and ready.
# This code is released under an MIT license (the same as Rails).
require 'rubygems' | |
require 'nokogiri' | |
namespace "import" do | |
# Programme names recognised in a credit line. Patterns are tried in this
# order and the first match wins.
RECIPE_PROGRAMMES = [
  [/Afternoon/i,     'Afternoons'],
  [/Nine To Noon/i,  'Nine To Noon'],
  [/Saturday/i,      'Saturday Morning'],
  [/Country Life/i,  'Country Life'],
  [/This Way Up/i,   'This Way Up'],
  [/Summer Report/i, 'Summer Report'],
  [/Nights/i,        'Nights']
].freeze

# Lines that credit a chef and/or a programme; they are metadata and are
# kept out of the recipe body.
RECIPE_CREDIT_LINE = /Chef:|as heard on|recipe from|with Jim Mora|recipe by|Chef(.*)as heard/i

# A UTF-8 non-breaking space (U+00A0).
RECIPE_NBSP = [160].pack('U').freeze

# Removes every HTML/XML tag from +text+. Entities are left untouched.
def strip_recipe_tags(text)
  text.gsub(/<[^>]*>/, '')
end

# Returns the programme name mentioned in +text+, or 'none' when no known
# programme is mentioned.
def recipe_programme_for(text)
  found = RECIPE_PROGRAMMES.find { |pattern, _name| text =~ pattern }
  found ? found.last : 'none'
end

# Extracts the chef's name from a tag-free credit line.
#
# Returns the text that follows "Chef" (optionally "Chef:") up to
# "as heard on", "editor" or the end of the line, with commas removed and
# surrounding whitespace stripped. Returns nil when the line does not contain
# "Chef" (case-sensitive).
def extract_recipe_chef_name(text)
  match = text.match(/Chef:?(.*?)(?:(?i:as heard on|editor)|$)/)
  return nil unless match

  name = match[1].delete(',')
  # NOTE(review): the original substitution here appeared to delete plain
  # spaces, which would make the first/last name split impossible. It is
  # assumed it was meant to target non-breaking spaces -- confirm.
  name = name.gsub(/&nbsp;|&#160;/, ' ').gsub(RECIPE_NBSP, ' ')
  name.strip
end

# Splits a chef name into [first_name, last_name] using its first two words.
# A trailing possessive ("Biuso's", "Jones'") is removed from the last name.
# Returns nil when the name has fewer than two words.
def split_recipe_chef_name(chef_name)
  first_name, last_name = chef_name.to_s.split(' ')
  return nil unless first_name && last_name

  [first_name, last_name.strip.sub(/'s?\z/, '')]
end

# Parses the tidied HTML of one recipe body, line by line.
#
# Returns a Hash with:
#   :title       - text of the last <h2> line, tags removed and stripped ('' if none)
#   :title_count - number of <h2> lines seen
#   :chef_name   - name taken from the last line containing "Chef" ('' if none)
#   :chef_found  - true when any credit line contained "Chef"
#   :programme   - programme name, 'none' when credit lines named no known
#                  programme, '' when there were no credit lines at all
#   :body        - every remaining line, concatenated unchanged
def parse_recipe_html(html)
  # remove any divs
  html = html.gsub(/<div(?: [^>]*)?>|<\/div>/, '')

  parsed = {
    :title => '', :title_count => 0,
    :chef_name => '', :chef_found => false,
    :programme => '', :body => ''
  }

  html.each_line do |line|
    case line
    when /<h2>/
      parsed[:title] = strip_recipe_tags(line).strip
      parsed[:title_count] += 1
    when RECIPE_CREDIT_LINE
      text = strip_recipe_tags(line)

      # A later credit line that names no programme must not wipe out a
      # programme that an earlier line already identified.
      programme = recipe_programme_for(text)
      parsed[:programme] = programme if programme != 'none' || parsed[:programme].empty?

      chef_name = extract_recipe_chef_name(text)
      unless chef_name.nil?
        parsed[:chef_name] = chef_name
        parsed[:chef_found] = true
      end
    else
      parsed[:body] << line
    end
  end

  parsed
end

# Returns the DateTime parsed from the first <date> element of +recipe_doc+,
# or nil when the element is missing or its content is not a parseable date.
def recipe_broadcast_date(recipe_doc)
  node = recipe_doc.xpath('//date').first
  return nil unless node

  DateTime.parse(node.content)
rescue ArgumentError
  nil
end

desc "Imports recipes from XML"
# Reads the XML file named by ENV['file'], deletes all existing recipes, and
# imports every <recipe> body for which a title, a two-word chef name and a
# known programme can be found. Anything else is reported as "MISSED!".
task :recipes_from_xml => :environment do
  # Fail loudly (non-zero exit) instead of exiting silently when no file is given.
  file_name = ENV['file'] || abort('No input file given. Usage: rake import:recipes_from_xml file=recipes.xml')

  puts 'Reading XML'
  # Block form guarantees the file is closed even if parsing raises.
  doc = File.open(file_name) { |file| Nokogiri::XML(file) }

  # check for errors
  doc.errors.each do |error|
    puts "ERROR on #{file_name}: #{error.to_s.strip}\n"
  end

  recipe_count = 0
  valid_count = 0
  titles = 0

  # NOTE(review): every existing recipe is destroyed before the import is
  # known to succeed -- confirm this is intended.
  Recipe.destroy_all

  doc.xpath('//recipes//recipe').each do |recipe_data|
    recipe_count += 1

    recipe_doc = Nokogiri::XML(recipe_data.to_s)
    date = recipe_broadcast_date(recipe_doc)
    chef_found = false

    recipe_doc.xpath('//body').each do |body_node|
      parsed = parse_recipe_html(tidy(body_node.content))
      titles += parsed[:title_count]
      # Once a chef has been seen in any body of this recipe, it stays "found".
      chef_found ||= parsed[:chef_found]

      title = parsed[:title]
      chef_name = parsed[:chef_name]
      programme = parsed[:programme]
      names = split_recipe_chef_name(chef_name)

      # these are the valid ones to import
      if chef_found && programme != 'none' && !title.empty? && names
        valid_count += 1

        # import to ELF
        first_name, last_name = names
        title = CGI.unescapeHTML(title)

        recipe = Recipe.find_by_title(title)
        if recipe
          # NOTE(review): this destroys the chef records themselves, not just
          # their link to this recipe -- confirm that is intended.
          recipe.chefs.each { |chef_obj| chef_obj.destroy }
        else
          recipe = Recipe.new
          recipe.content ||= SupportingContent.new(:title => title)
        end

        person = Person.find_or_create_by_first_name_and_last_name(first_name, last_name)
        programme_obj = Programme.find_or_create_by_name(programme)

        recipe.content.attributes = {:body => parsed[:body], :body_updated_at => date, :published_at => date}
        recipe.attributes = {:programme => programme_obj}
        recipe.broadcast_at = date
        recipe.chefs << person

        recipe.save! && recipe.content.save!
      else
        puts "MISSED!"
        puts "CHEF: #{chef_name}"
        puts "TITLE: #{title}"
        puts "PROG: #{programme}"
        puts "Date: #{date}"
      end
    end
  end

  # data output
  puts "Recipe count: #{recipe_count} "
  puts "Title count: #{titles} "
  puts "Parse success: #{valid_count} "
end
# Cleans up an HTML fragment by piping it through an external program.
#
# data    - the String to clean.
# command - the command line to run. Defaults to the `tidy` invocation used
#           by the importer, which writes tidy's diagnostics to log/tidy.log.
#           The command must read stdin to EOF and write the result to stdout.
#
# Returns the program's output, or +data+ unchanged when the program produced
# no output or the pipe broke while writing to it (the failure is reported on
# $stderr).
def tidy(data, command = nil)
  command ||= 'tidy -f "log/tidy.log" --output-xhtml 1 --wrap 0 --show-body-only 1 --drop-empty-paras 1 --force-output yes -wrap 0 -utf8'

  cleaned = nil
  pipe = IO.popen(command, 'w+')
  begin
    pipe.write(data)
    # Signal EOF so the program starts producing output.
    pipe.close_write
    cleaned = pipe.read
  rescue Errno::EPIPE => e
    # Interpolate the message: concatenating a String with the exception
    # object itself raises TypeError and would mask the real failure.
    $stderr.puts "Running 'tidy' failed: #{e.message}"
  ensure
    pipe.close unless pipe.closed?
  end

  cleaned.nil? || cleaned.empty? ? data : cleaned
end
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment