Skip to content

Instantly share code, notes, and snippets.

@mathildathompson
Created May 16, 2013 13:58
Show Gist options
  • Save mathildathompson/5591922 to your computer and use it in GitHub Desktop.
Save mathildathompson/5591922 to your computer and use it in GitHub Desktop.
# Nokogiri SAX callbacks
require_relative 'wiki_handler'
# SAX document handler that accumulates the character data of each <title>
# and <text> element and, when a <page> element closes, writes one
# "INSERT INTO articles" statement for that page to OUTPUT_PATH.
#
# Usage sketch (the parser wiring lives outside this class):
#   Nokogiri::XML::SAX::Parser.new(WikiImport.new(logger)).parse(io)
class WikiImport < Nokogiri::XML::SAX::Document
  include WikiHandler

  # File that receives the generated SQL, one statement per line.
  # The containing directory must already exist.
  OUTPUT_PATH = 'data/my_stuff.sql'

  # A stack (Array) of attributes as we find them
  attr_accessor :attribute_stack
  # A logger to output to the screen
  attr_accessor :logger
  # A counter incremented each time a page has been written
  attr_accessor :page_count
  # The output SQL file
  attr_accessor :sql
  # The contents of the last page as a hash
  attr_accessor :last_page
  # The text contents of last element's body
  attr_accessor :last_body

  # logger - object used for diagnostic output; only stored here.
  def initialize(logger)
    self.logger = logger
    self.attribute_stack = []
    self.page_count = 0
    self.last_page = {}
    self.last_body = ""
    @output_file_count = 0
    # Mutable buffers: #characters appends to them in place.
    @title = ''.dup
    @text = ''.dup
    @interestedtitle = false
    @interestedtext = false
    # Becomes true once OUTPUT_PATH has been truncated by the first write.
    @output_started = false
  end

  def start_document
  end

  def end_document
    # logger.debug "End document"
  end

  # Called for every opening tag. Exactly one capture flag is on while we are
  # directly inside <title> or <text>; any other element switches both off.
  #
  # name  - element name as a String.
  # attrs - element attributes (unused).
  def start_element(name, attrs = [])
    # logger.debug "Found element #{name}"
    @interestedtitle = (name == 'title')
    @interestedtext = (name == 'text')
  end

  # Called with a chunk of character data; the parser may deliver the content
  # of a single element in several chunks, so the chunks are appended.
  def characters(c)
    @title << c if @interestedtitle
    @text << c if @interestedtext
  end

  # Called for every closing tag. Stops capturing when <title>/<text> close
  # (so whitespace that follows the closing tag is not collected) and flushes
  # the collected page when <page> closes.
  def end_element(name)
    case name
    when 'title'
      @interestedtitle = false
    when 'text'
      @interestedtext = false
    when 'page'
      append_page_sql
    end
  end

  # NOTE(review): every call to an undefined method on this object is
  # swallowed and returns nil, which also hides typos in method names.
  # Kept as-is because callers / WikiHandler may depend on it.
  def method_missing(m, *args, &block)
    # logger.debug("Ignoring #{m}")
  end

  protected

  # Name of the handler method for an element, e.g. "Page" -> :handle_page.
  def handler_method(name)
    :"handle_#{name.downcase}"
  end

  # Prepares a value for use inside a single-quoted SQL string literal:
  # trims surrounding whitespace and doubles every apostrophe.
  # Only apostrophes are escaped; backslashes are left untouched.
  def clean(s)
    s.strip.gsub("'", "''")
  end

  def output_file_name
    "/tmp/articles-#{@output_file_count}.sql"
  end

  # Writes the INSERT statement for the page collected so far, echoes the raw
  # title and text to stdout, and resets the buffers for the next page.
  def append_page_sql
    # Truncate the file on the first page of a run and append afterwards, so
    # every page is kept instead of each one overwriting the previous.
    mode = @output_started ? 'a' : 'w'
    File.open(OUTPUT_PATH, mode) do |file|
      file << "INSERT INTO articles(title, body) VALUES('#{clean(@title)}', '#{clean(@text)}');\n"
    end
    @output_started = true
    self.page_count += 1

    puts @title
    puts @text
    # logger.debug @text
    @title = ''.dup
    @text = ''.dup
  end

  # Leftover notes from a DOM-based (doc.css) approach, kept for reference:
  # results []
  # page = doc.css('page').text
  # page = doc.css('title').text
  # page = doc.css('body').text
  # results << page
  # results << title
  # results << body
  # binding.pry
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment