Created
May 16, 2013 13:58
-
-
Save mathildathompson/5591922 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Nokogiri SAX callbacks
require_relative 'wiki_handler'
# Nokogiri SAX handler that collects the character data of the <title> and
# <text> elements and, each time a <page> element closes, writes one SQL
# INSERT statement for that page to data/my_stuff.sql.
# NOTE(review): the element names suggest a MediaWiki XML dump -- confirm
# against the caller that feeds this handler.
class WikiImport < Nokogiri::XML::SAX::Document
  include WikiHandler

  # A stack (Array) of attributes as we find them
  attr_accessor :attribute_stack
  # A logger to output to the screen
  attr_accessor :logger
  # A counter incremented each time a page has been written out
  attr_accessor :page_count
  # The output SQL file
  attr_accessor :sql
  # The contents of the last page as a hash
  attr_accessor :last_page
  # The text contents of last element's body
  attr_accessor :last_body

  # @param logger [Object] stored in #logger; this class itself only
  #   references it from commented-out debug statements.
  def initialize(logger)
    self.logger = logger
    self.attribute_stack = []
    self.page_count = 0
    self.last_page = {}
    self.last_body = ''
    @output_file_count = 0
    @interestedtitle = false
    @interestedtext = false
    # False until the first page of this run has been written; decides
    # whether the SQL file is truncated or appended to.
    @sql_started = false
    reset_page_buffers
  end

  # SAX callback: nothing to do when the document starts.
  def start_document
  end

  # SAX callback: nothing to do when the document ends.
  def end_document
    # logger.debug "End document"
  end

  # SAX callback for an opening tag. Starts capturing character data when the
  # element is <title> or <text>; any other element stops both captures.
  #
  # @param name [String] element name
  # @param attrs [Array] element attributes (unused)
  def start_element(name, attrs)
    # logger.debug "Found element #{name}"
    @interestedtitle = (name == 'title')
    @interestedtext = (name == 'text')
  end

  # SAX callback for character data. Appends the chunk to whichever buffer is
  # currently being captured. The parser may deliver one text node in several
  # chunks, hence the accumulation.
  #
  # @param c [String] chunk of character data
  def characters(c)
    # `<<` appends in place instead of reallocating the whole buffer per chunk.
    @title << c if @interestedtitle
    @text << c if @interestedtext
  end

  # SAX callback for a closing tag. Closing <title>/<text> stops the matching
  # capture so that whitespace between elements is not added to the buffer.
  # Closing <page> writes the INSERT statement, echoes title and text to
  # stdout, bumps #page_count and clears the buffers for the next page.
  #
  # @param name [String] element name
  def end_element(name)
    case name
    when 'title'
      @interestedtitle = false
    when 'text'
      @interestedtext = false
    when 'page'
      write_page_sql
      puts @title
      puts @text
      # logger.debug @text
      self.page_count += 1
      reset_page_buffers
    end
  end

  # Silently ignores every call to an undefined method and returns nil.
  # NOTE(review): this also hides typos and genuinely missing methods;
  # consider removing it or pairing it with respond_to_missing? once it is
  # clear whether WikiHandler or a caller relies on it.
  def method_missing(m, *args, &block)
    # logger.debug("Ignoring #{m}")
  end

  protected

  # @param name [String] element name
  # @return [Symbol] the symbol "handle_<downcased name>"
  def handler_method(name)
    :"handle_#{name.downcase}"
  end

  # Prepares a value for use inside a single-quoted SQL string literal.
  #
  # @param s [String]
  # @return [String] s with surrounding whitespace removed and every single
  #   quote doubled ('' is the SQL escape for a literal quote)
  def clean(s)
    s.strip.gsub("'", "''")
  end

  # @return [String] path built from the current output file counter
  def output_file_name
    "/tmp/articles-#{@output_file_count}.sql"
  end

  private

  # Starts fresh, mutable title/text buffers for the next page.
  # Unary plus yields an unfrozen string even if string literals are frozen.
  def reset_page_buffers
    @title = +''
    @text = +''
  end

  # Writes the INSERT statement for the current page. The first page of a run
  # truncates the file; later pages append, so every page is kept instead of
  # only the last one. The block form closes the file even if the write raises.
  def write_page_sql
    mode = @sql_started ? 'a' : 'w'
    File.open('data/my_stuff.sql', mode) do |sql|
      sql << "INSERT INTO articles(title, body) VALUES('#{clean(@title)}', '#{clean(@text)}');\n"
    end
    @sql_started = true
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment