Created
September 16, 2009 03:56
-
-
Save heycarsten/187854 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Some more pondering for my ScrapeKit normalizer builder. I keep throwing
# different use-cases at it, which keeps causing me to change my design. I think
# I am starting to get to a happy place.
# Design sketch of a normalizer definition. `normalizer`, `collect`, `locate`,
# `all` and `select` are DSL methods defined outside this snippet, so the notes
# below describe only how they are invoked here, not how they are implemented.
normalizer do |doc|
  # Gather :inventories from the rows matched by the table-row selector.
  # NOTE(review): :exclude presumably skips the first row, the last row and any
  # row containing 'UNKNOWN QUANTITY', and :missing => :halt presumably aborts
  # when nothing matches -- confirm against the DSL implementation.
  doc.collect :inventories, 'table[width="66%"] tr',
              :exclude => [0, -1, 'UNKNOWN QUANTITY'], :missing => :halt do |tr|
    # The block destructures its argument into three values; the middle one
    # is deliberately ignored.
    tr.all 'td[width="25%"]', :include => [1..3] do |(name, _, quantity)|
      name.select(:name, :format => [:titlecase])
      quantity.select(:quantity, :format => [:to_i])
    end
  end

  # Single-record section scoped to '.user-details'.
  # NOTE(review): :include_if => :user_is_active looks like the name of a
  # predicate helper defined elsewhere -- confirm.
  doc.locate '.user-details', :missing => :halt, :include_if => :user_is_active do |div|
    # The block accepts three arguments but only uses the first. The unused
    # ones are underscore-prefixed so they no longer shadow the outer `doc`;
    # the block's arity is unchanged.
    div.select '.name' do |content, _node, _doc|
      # Whitespace split: words beyond the second are dropped, and `last` is
      # nil when there is only one word.
      first, last = content.split
      { :first_name => first, :last_name => last }
    end
    div.select :age, '.age', :format => [/[0-9]+/, :to_i]
    div.select :address, '.address', :format => [:strip]
    div.select :country, '.country-code', :format => [:country_code_to_name]
  end

  # Gather :attendees from each list item; the item's content is split on '|'
  # into a name and an email, each stripped of surrounding whitespace.
  doc.collect :attendees, 'ul.attendees li' do |li|
    li.select do |content|
      name, email = content.split('|').map { |part| part.strip }
      { :name => name, :email => email }
    end
  end

  # Gather :job_histories from each job-history list item.
  doc.collect :job_histories, '.user-job-history ul li' do |li|
    li.select :company, '.company-name'
    li.select :title, '.job-title'
    # NOTE(review): :helper => :job_dates presumably delegates extraction to a
    # helper named job_dates defined elsewhere -- confirm.
    li.select '.job-timeframe', :helper => :job_dates
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment