Web crawling example in Ruby
%W[rubygems anemone fileutils uri].each { |r| require r }

site_root = "url" # replace with the full URL of the site to crawl

# Create the root folder, named after the site's host
folder = URI.parse(site_root).host
FileUtils.mkdir_p(File.join(".", folder))

Anemone.crawl(site_root) do |anemone|
  anemone.on_every_page do |page|
    filename = page.url.request_uri.to_s
    filename = "/index.html" if filename == "/" # Make sure the file name is valid
    folders = filename.split("/")
    filename = folders.pop

    # Create the current subfolder
    FileUtils.mkdir_p(File.join(".", folder, folders))

    print "Downloading '#{page.url}'..."
    File.open(File.join(".", folder, folders, filename), "w") { |f| f.write(page.body) }
    puts "done."
  end
end
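Usage (a minimal sketch, assuming the anemone gem is available, e.g. via `gem install anemone`): set `site_root` to the URL of the site you want to mirror, save the script under any name such as `crawl.rb` (the filename is an arbitrary choice here), and run it with `ruby crawl.rb`. Anemone follows every link reachable from the root, and the `on_every_page` callback writes each page body into a local directory tree named after the site's host, so the result is a rough offline copy of the site.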