-
-
Save kmile/827475 to your computer and use it in GitHub Desktop.
# A small DSL that helps parse documents using Nokogiri::XML::Reader. The
# XML Reader is a good way to move a cursor through a (large) XML document fast,
# and is not as cumbersome as writing a full SAX document handler. Read about
# it here: http://nokogiri.org/Nokogiri/XML/Reader.html
# | |
# Just pass the reader to this parser and specify the nodes that you are interested
# in inside a block. You can parse every node or only look inside certain nodes.
# | |
# A small example: | |
# | |
#   Xml::Parser.new(Nokogiri::XML::Reader(open(file))) do
#     inside_element 'User' do
#       for_element 'Name' do puts "Username: #{inner_xml}" end
#       for_element 'Email' do puts "Email: #{inner_xml}" end
#
#       for_element 'Address' do
#         puts 'Start of address:'
#         inside_element do
#           for_element 'Street' do puts "Street: #{inner_xml}" end
#           for_element 'Zipcode' do puts "Zipcode: #{inner_xml}" end
#           for_element 'City' do puts "City: #{inner_xml}" end
#         end
#         puts 'End of address'
#       end
#     end
#   end
# | |
# It does NOT fail on missing tags, and does not guarantee order of execution. It parses
# every tag regardless of nesting. The only way to guarantee scope is by using
# the `inside_element` method. This limits the parsing to the current or the named tag.
# If tags are encountered multiple times, their blocks will be called multiple times.
require 'nokogiri' | |
module Xml
  # Cursor-style DSL around a Nokogiri::XML::Reader-like object.
  #
  # The block handed to +new+ is run with +instance_eval+ once for every node
  # the reader yields. Inside that block +self+ is the parser, so +name+,
  # +inner_xml+, +attribute+, +for_element+ and +inside_element+ all refer to
  # the node the reader is currently positioned on. Because of +instance_eval+,
  # methods of the object that created the parser are NOT reachable from the
  # block; local variables captured by the block still are.
  class Parser
    # Walks the whole reader, evaluating +block+ for each node.
    #
    # @param node [#each] the reader. The parser relies on +each+ advancing the
    #   reader so that +node+ itself always describes the node being visited.
    # @yield evaluated in the context of this parser for every node.
    def initialize(node, &block)
      @node = node
      @node.each { instance_eval(&block) }
    end

    # @return [String] name of the current node.
    def name
      @node.name
    end

    # @return [String] inner XML of the current node with surrounding
    #   whitespace stripped.
    def inner_xml
      @node.inner_xml.strip
    end

    # @return [Boolean] true when the current node is an opening element.
    def is_start?
      @node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
    end

    # @return [Boolean] true when the current node is a closing element.
    def is_end?
      @node.node_type == Nokogiri::XML::Reader::TYPE_END_ELEMENT
    end

    # @param attribute [String] attribute name.
    # @return whatever the reader returns for that attribute on the current node.
    def attribute(attribute)
      @node.attribute(attribute)
    end

    # Evaluates +block+ when the current node is the opening tag of +name+.
    # Does not consume any nodes by itself.
    #
    # @param name [String] element name to match.
    def for_element(name, &block)
      return unless is_start? && self.name == name

      instance_eval(&block)
    end

    # Restricts +block+ to the nodes nested inside one element.
    #
    # With +name+ the scope opens only on the opening tag of that element;
    # without it the scope opens on whatever element the reader is currently
    # on. The reader is then advanced, evaluating +block+ for every node, until
    # the closing tag with the same name at the same depth is reached.
    #
    # Nothing happens for self-closing elements (they have no content) or when
    # the current node is not an opening element. The latter guard matters for
    # the unnamed form: a scope opened on a comment, text or end node would
    # never see a matching closing tag and would silently run +block+ over the
    # remainder of the document.
    #
    # @param name [String, nil] element name, or nil for the current element.
    def inside_element(name = nil, &block)
      return if @node.self_closing?
      return unless is_start? && (name.nil? || self.name == name)

      tag = @node.name
      depth = @node.depth
      @node.each do
        # Depth is compared as well so that a nested element with the same
        # name does not close the scope early.
        return if is_end? && @node.depth == depth && self.name == tag

        instance_eval(&block)
      end
    end
  end
end
That's a really nice and useful gist.
Thanks!
Very nice code! Thanks a lot!
Is there any more documentation or examples on how to use this anywhere? I'm having a hard time instantiating classes inside for_element. Keep getting NoMethodErrors...
This really is fantastic - excellent work!
I just woke up in the middle of the night envisioning something like this.
And it already exists.
Good work.
OMG! This is awesome!!!! +1
inner_xml doesn't seem to unescape & -- what's the recommended way to do this?
Can anybody help me? I have an XML file which is 1 GB, and I need to find a certain category and import 100 products from it.
here is my code
in controller
# Controller action: parses the uploaded XML file into a full Nokogiri
# document, passes it to Product.import together with the requested category,
# and redirects to the product list with a notice.
# Does nothing when no :xml_file parameter was submitted.
def import
  upload = params[:xml_file]
  return unless upload

  doc = Nokogiri::XML::Document.parse(upload)
  # The number shown in the notice is the count of the first two <offer>
  # nodes, taken before the import runs.
  total_product = doc.xpath('//shop/offers/offer').take(2).size
  Product.import(doc, params[:category_id])
  redirect_to products_path, notice: "#{total_product} Product added."
end
and in product model
# Creates a Product for each of the first two <offer> nodes of +doc+ whose
# <categoryId> text equals +category+. All records are created inside one
# transaction. Nothing is created when +fashion+ is nil
# (NOTE(review): +fashion+ is defined elsewhere - confirm what it returns).
def self.import(doc, category)
  offers = doc.xpath('//shop/offers/offer').take(2)
  return if fashion.nil?

  transaction do
    offers.each do |offer|
      category_id = offer.at_xpath('categoryId').text
      next unless category_id == category

      # The <fashion> children are optional, so they are looked up once and
      # checked for presence before their text is read.
      gender_node = offer.at_xpath('fashion/gender')
      type_node = offer.at_xpath('fashion/type')
      gender = gender_node.text.gsub("m", "Male").gsub("f", "Female") if gender_node.present?
      product_type = type_node.present? ? type_node.text : ''

      Product.create!(
        price: offer.at_xpath('price').text,
        category_id: category_id,
        remote_image_url: offer.at_xpath('picture').text.strip,
        brand_id: offer.at_xpath('vendor').text,
        title: offer.at_xpath('name').text,
        description: offer.at_xpath('description').text,
        gender: gender,
        product_type: product_type
      )
    end
  end
end
form
h2.text-center Import Products
= form_tag import_products_path, multipart: true do |f|
= file_field_tag :xml_file
br
br
br
= submit_tag "Import"
Any advice will be appreciated. Thanks in advance.
Had a 60+GB xml on my hands - and until @kmile showed me the path I was utterly lost in XML up above my ears :)
Thank you - from the bottom of my ❤️
This is beautiful and saved me so much time and pain. Thank you @kmile.
Thanks a lot for this wonderful piece of code. Did anyone get it to work with JRuby?
@kmile this is awesome! Is there a way to prevent the text coming back with CDATA wrappers?
<![CDATA[My Text]]>
Still rocking the house in 2018 !
License?
It's 4 years after the last comment. And still this is useful. Thank you.
This appears to be from libxml2. It seems that it references the
XmlTextReaderMode
enum. But I cannot tell for sure without looking at the source what these states/modes mean, or if this is a complete list.