Created
November 20, 2012 13:43
-
-
Save vindia/4118003 to your computer and use it in GitHub Desktop.
Simple script for retrieving URLs out of generated sitemaps as used by Google Analytics and the like.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri'
require 'nokogiri'
# Queues every child sitemap listed in the current sitemap index.
#
# Reads the <loc> of each <sitemap> entry from @xml and appends its text to
# @buffer. Surrounding whitespace is stripped so that a <loc> whose URL is
# wrapped in spaces or newlines still yields a usable address.
def add_urls_to_buffer!
  @xml.css('sitemap loc').each { |node| @buffer << node.text.strip }
end
# Appends every page URL of the current URL set to the output file.
#
# Reads the <loc> of each <url> entry from @xml, echoes it to stdout and
# appends it, one URL per line, to 'all_urls_in_sitemap' in the current
# working directory. Returns the matched <loc> nodes.
def save_urls_to_file!
  locs = @xml.css('url loc')
  # Nothing to write: return before opening so that an empty URL set does
  # not create an empty output file.
  return locs if locs.empty?

  # Open the output file once for the whole batch rather than once per URL.
  File.open('all_urls_in_sitemap', 'a') do |file|
    locs.each do |loc|
      # Trim surrounding whitespace so each URL occupies exactly one line.
      url = loc.text.strip
      puts "Saving #{url}"
      file.write "#{url}\n"
    end
  end
  locs
end
# Fetches one sitemap and dispatches on its root element.
#
# url - String location of the sitemap: a URL with a scheme, or a local path.
#
# A <sitemapindex> document has its child sitemaps queued through
# add_urls_to_buffer!; a <urlset> document has its URLs written out through
# save_urls_to_file!. Any other content, including a document without a root
# element, is reported and skipped. A source that cannot be opened is
# reported as invalid and nil is returned.
def process_sitemap(url)
  begin
    # Dispatch explicitly instead of calling Kernel#open. Kernel#open treats
    # a string starting with "|" as a command to run, and the URLs handled
    # here can come from remote <loc> values. On Ruby 3.0+ Kernel#open also
    # no longer fetches URLs, so every remote sitemap would be rejected.
    document = if url =~ %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}
                 URI.parse(url).open
               else
                 File.open(url)
               end
  rescue StandardError
    puts "Invalid sitemap: #{url}"
    return
  end

  puts ">>> Processing #{url}"
  begin
    @xml = Nokogiri::XML document
  ensure
    # The parsed document is kept in @xml; release the underlying handle.
    document.close
  end

  # root is nil when the content could not be parsed as XML; fall through to
  # the "no sitemap" branch instead of raising NoMethodError.
  root = @xml.root
  case root && root.name
  when 'sitemapindex' then add_urls_to_buffer!
  when 'urlset' then save_urls_to_file!
  else
    puts "Sorry, no sitemap found"
  end
end
# Process the sitemaps.
#
# @buffer is the work queue of sitemap URLs. It starts with the URL given as
# the first command-line argument (or the default below) and grows whenever
# process_sitemap handles a sitemap index.
sitemap = ARGV[0] || 'http://www.springest.nl/sitemap-links-nl.xml'
@buffer = [sitemap]
seen = {}
# Drain the queue explicitly instead of appending to the array while `each`
# is iterating over it.
until @buffer.empty?
  url = @buffer.shift
  # Skip sitemaps already handled, so an index that references itself (or a
  # sitemap listed twice) cannot loop forever or duplicate output.
  next if seen[url]

  seen[url] = true
  process_sitemap url
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment