rhulse · April 30, 2011 01:08
diff --git a/recipe_import.rb b/recipe_import.rb
 # This code is for illustrative purposes only and should be read in conjunction
 # with this blog post:
 # http://richardhulse.blogspot.com/2011/04/rebuilding-radio-nz-part-4-content.html

 # I was still learning Rails and Ruby at the time this was written,
 # so it is a bit rough and ready.

 # This code is released under an MIT license (the same as Rails).


 require 'rubygems'
 require 'nokogiri'

 namespace "import" do
  # Rake task import:recipes_from_xml.
  #
  # Reads the XML file named by the `file` environment variable, deletes every
  # existing Recipe, then walks each <recipe> element and its <body> markup.
  # Each body is cleaned with tidy(), stripped of <div> tags and scanned line
  # by line for a title (<h2>), a chef credit and a programme name; every
  # other line becomes the recipe body. A body is imported only when a chef
  # name of at least two words, a recognised programme and a title were all
  # found; otherwise the extracted values are printed under "MISSED!".
  # Summary counts are printed at the end.
  desc "Imports recipes from XML"
  task :recipes_from_xml => :environment do

    # Stop with a message and a non-zero status when no file was supplied.
    file_name = ENV['file'] || abort('No file given. Usage: rake import:recipes_from_xml file=recipes.xml')

    puts 'Reading XML'

    # Block form closes the handle even if parsing raises.
    doc = File.open(file_name) { |file| Nokogiri::XML(file) }

    # check for errors
    doc.errors.each do |error|
      puts "ERROR on #{file_name}: #{error.to_s.strip}\n"
    end

    recipe_count = 0
    valid_count = 0
    titles = 0

    # Programme names keyed by the text that identifies them in a credit line.
    # The first matching pattern wins, so the order is significant.
    programmes = [
      [/Afternoon/i,     'Afternoons'],
      [/Nine To Noon/i,  'Nine To Noon'],
      [/Saturday/i,      'Saturday Morning'],
      [/Country Life/i,  'Country Life'],
      [/This Way Up/i,   'This Way Up'],
      [/Summer Report/i, 'Summer Report'],
      [/Nights/i,        'Nights']
    ]
    credit_pattern = /Chef:|as heard on|recipe from|with Jim Mora|recipe by|Chef(.*)as heard/i
    tag_pattern = /<(.|\n)*?>/

    Recipe.destroy_all

    doc.xpath('//recipes//recipe').each do |recipe_data|
      recipe_count += 1

      # Re-parse the fragment so the '//' queries below only see this recipe.
      recipe_doc = Nokogiri::XML(recipe_data.to_s)

      # The date belongs to the recipe, so work it out once. A missing <date>
      # or an unparseable value leaves it nil.
      date_node = recipe_doc.xpath('//date').first
      date = begin
        DateTime.parse(date_node.content) if date_node
      rescue ArgumentError
        nil
      end

      recipe_doc.xpath('//body').each do |body_node|
        html = tidy(body_node.content)

        # remove any divs
        html.gsub!(/<div>/, '')
        html.gsub!(/<\/div>/, '')
        html.gsub!(/<div [^>]*>/, '')

        title = ''
        chef_name = ''
        programme = ''
        body = ''

        html.each_line do |line|
          case line
          when /<h2>/
            title = line.gsub(tag_pattern, '')
            titles += 1
          when credit_pattern
            credit = line.gsub(tag_pattern, '')

            # Every credit line resets the programme; 'none' marks a credit
            # that names no known programme.
            matched = programmes.find { |pattern, _name| credit =~ pattern }
            programme = matched ? matched.last : 'none'

            # Keep everything after "Chef"/"Chef:"; only the first two words
            # are used as the name further down. (The greedy capture means a
            # trailing "as heard on ..." is part of the captured text.)
            if credit =~ /Chef:?(.*)/
              chef_name = $1.gsub(',', '').gsub('&nbsp;', '')
            end
          else
            body << line
          end
        end

        first_name, last_name = chef_name.split(' ')
        # Drop a possessive ending from the surname; last_name is nil when the
        # credit held fewer than two words.
        last_name = last_name.sub(/'s$/, '').sub(/s'$/, '') if last_name

        # these are the valid ones to import
        if first_name && last_name && (programme != 'none') && !title.empty?
          valid_count += 1

          # import to ELF
          title = CGI.unescapeHTML(title)
          recipe = Recipe.find_by_title(title)
          if recipe
            recipe.chefs.each { |chef_obj| chef_obj.destroy }
          else
            recipe = Recipe.new
            recipe.content ||= SupportingContent.new(:title => title.strip)
          end
          person = Person.find_or_create_by_first_name_and_last_name(first_name, last_name)
          programme_obj = Programme.find_or_create_by_name(programme)
          recipe.content.attributes = {:body => body, :body_updated_at => date, :published_at => date}
          recipe.attributes = {:programme => programme_obj}
          recipe.broadcast_at = date
          recipe.chefs << person
          recipe.save! && recipe.content.save!
        else
          puts "MISSED!"
          puts "CHEF: #{chef_name}"
          puts "TITLE: #{title}"
          puts "PROG: #{programme}"
          puts "Date: #{date}"
        end

      end

    end

    # data output
    puts "Recipe count: #{recipe_count} "
    puts "Title count: #{titles} "
    puts "Parse success: #{valid_count} "

  end


  # Cleans up a fragment of markup by piping it through the external `tidy`
  # command-line program.
  #
  # data - the markup to clean; it is written to tidy's standard input.
  #
  # Returns whatever tidy wrote to standard output, or +data+ unchanged when
  # tidy produced no output or the pipe broke (Errno::EPIPE). The -f option
  # directs tidy's own diagnostics to log/tidy.log.
  def tidy(data)
    command = 'tidy -f "log/tidy.log" --output-xhtml 1 --wrap 0 --show-body-only 1 --drop-empty-paras 1 --force-output yes -wrap 0 -utf8'
    cleaned = nil
    begin
      # The block form closes the pipe even when an exception is raised.
      cleaned = IO.popen(command, 'w+') do |pipe|
        pipe.write(data)
        pipe.close_write # close our write end so the child sees end-of-input
        pipe.read
      end
    rescue Errno::EPIPE => e
      # Interpolate the message: String + Exception would raise TypeError.
      $stderr.puts "Running 'tidy' failed: #{e.message}"
    end
    return data if cleaned.nil? || cleaned.empty?
    cleaned
  end
 end
	# This code is for illustrative purposes only and should be read in conjunction
	# with this blog post:
	# http://richardhulse.blogspot.com/2011/04/rebuilding-radio-nz-part-4-content.html

	# I was still learning Rails and Ruby at the time this was written,
	# so it is a bit rough and ready.

	# This code is released under an MIT license (the same as Rails).


	require 'rubygems'
	require 'nokogiri'

	namespace "import" do
	# Rake task import:recipes_from_xml.
	#
	# Reads the XML file named by the `file` environment variable, deletes every
	# existing Recipe, then walks each <recipe> element and its <body> markup.
	# Each body is cleaned with tidy(), stripped of <div> tags and scanned line
	# by line for a title (<h2>), a chef credit and a programme name; every
	# other line becomes the recipe body. A body is imported only when a chef
	# name of at least two words, a recognised programme and a title were all
	# found; otherwise the extracted values are printed under "MISSED!".
	# Summary counts are printed at the end.
	desc "Imports recipes from XML"
	task :recipes_from_xml => :environment do

	  # Stop with a message and a non-zero status when no file was supplied.
	  file_name = ENV['file'] || abort('No file given. Usage: rake import:recipes_from_xml file=recipes.xml')

	  puts 'Reading XML'

	  # Block form closes the handle even if parsing raises.
	  doc = File.open(file_name) { |file| Nokogiri::XML(file) }

	  # check for errors
	  doc.errors.each do |error|
	    puts "ERROR on #{file_name}: #{error.to_s.strip}\n"
	  end

	  recipe_count = 0
	  valid_count = 0
	  titles = 0

	  # Programme names keyed by the text that identifies them in a credit line.
	  # The first matching pattern wins, so the order is significant.
	  programmes = [
	    [/Afternoon/i,     'Afternoons'],
	    [/Nine To Noon/i,  'Nine To Noon'],
	    [/Saturday/i,      'Saturday Morning'],
	    [/Country Life/i,  'Country Life'],
	    [/This Way Up/i,   'This Way Up'],
	    [/Summer Report/i, 'Summer Report'],
	    [/Nights/i,        'Nights']
	  ]
	  credit_pattern = /Chef:|as heard on|recipe from|with Jim Mora|recipe by|Chef(.*)as heard/i
	  tag_pattern = /<(.|\n)*?>/

	  Recipe.destroy_all

	  doc.xpath('//recipes//recipe').each do |recipe_data|
	    recipe_count += 1

	    # Re-parse the fragment so the '//' queries below only see this recipe.
	    recipe_doc = Nokogiri::XML(recipe_data.to_s)

	    # The date belongs to the recipe, so work it out once. A missing <date>
	    # or an unparseable value leaves it nil.
	    date_node = recipe_doc.xpath('//date').first
	    date = begin
	      DateTime.parse(date_node.content) if date_node
	    rescue ArgumentError
	      nil
	    end

	    recipe_doc.xpath('//body').each do |body_node|
	      html = tidy(body_node.content)

	      # remove any divs
	      html.gsub!(/<div>/, '')
	      html.gsub!(/<\/div>/, '')
	      html.gsub!(/<div [^>]*>/, '')

	      title = ''
	      chef_name = ''
	      programme = ''
	      body = ''

	      html.each_line do |line|
	        case line
	        when /<h2>/
	          title = line.gsub(tag_pattern, '')
	          titles += 1
	        when credit_pattern
	          credit = line.gsub(tag_pattern, '')

	          # Every credit line resets the programme; 'none' marks a credit
	          # that names no known programme.
	          matched = programmes.find { |pattern, _name| credit =~ pattern }
	          programme = matched ? matched.last : 'none'

	          # Keep everything after "Chef"/"Chef:"; only the first two words
	          # are used as the name further down. (The greedy capture means a
	          # trailing "as heard on ..." is part of the captured text.)
	          if credit =~ /Chef:?(.*)/
	            chef_name = $1.gsub(',', '').gsub('&nbsp;', '')
	          end
	        else
	          body << line
	        end
	      end

	      first_name, last_name = chef_name.split(' ')
	      # Drop a possessive ending from the surname; last_name is nil when the
	      # credit held fewer than two words.
	      last_name = last_name.sub(/'s$/, '').sub(/s'$/, '') if last_name

	      # these are the valid ones to import
	      if first_name && last_name && (programme != 'none') && !title.empty?
	        valid_count += 1

	        # import to ELF
	        title = CGI.unescapeHTML(title)
	        recipe = Recipe.find_by_title(title)
	        if recipe
	          recipe.chefs.each { |chef_obj| chef_obj.destroy }
	        else
	          recipe = Recipe.new
	          recipe.content ||= SupportingContent.new(:title => title.strip)
	        end
	        person = Person.find_or_create_by_first_name_and_last_name(first_name, last_name)
	        programme_obj = Programme.find_or_create_by_name(programme)
	        recipe.content.attributes = {:body => body, :body_updated_at => date, :published_at => date}
	        recipe.attributes = {:programme => programme_obj}
	        recipe.broadcast_at = date
	        recipe.chefs << person
	        recipe.save! && recipe.content.save!
	      else
	        puts "MISSED!"
	        puts "CHEF: #{chef_name}"
	        puts "TITLE: #{title}"
	        puts "PROG: #{programme}"
	        puts "Date: #{date}"
	      end

	    end

	  end

	  # data output
	  puts "Recipe count: #{recipe_count} "
	  puts "Title count: #{titles} "
	  puts "Parse success: #{valid_count} "

	end


	# Cleans up a fragment of markup by piping it through the external `tidy`
	# command-line program.
	#
	# data - the markup to clean; it is written to tidy's standard input.
	#
	# Returns whatever tidy wrote to standard output, or +data+ unchanged when
	# tidy produced no output or the pipe broke (Errno::EPIPE). The -f option
	# directs tidy's own diagnostics to log/tidy.log.
	def tidy(data)
	  command = 'tidy -f "log/tidy.log" --output-xhtml 1 --wrap 0 --show-body-only 1 --drop-empty-paras 1 --force-output yes -wrap 0 -utf8'
	  cleaned = nil
	  begin
	    # The block form closes the pipe even when an exception is raised.
	    cleaned = IO.popen(command, 'w+') do |pipe|
	      pipe.write(data)
	      pipe.close_write # close our write end so the child sees end-of-input
	      pipe.read
	    end
	  rescue Errno::EPIPE => e
	    # Interpolate the message: String + Exception would raise TypeError.
	    $stderr.puts "Running 'tidy' failed: #{e.message}"
	  end
	  return data if cleaned.nil? || cleaned.empty?
	  cleaned
	end
	end