Created
December 14, 2011 10:25
-
-
Save clooth/1476043 to your computer and use it in GitHub Desktop.
Spider for Learn Ruby The Hard Way html book
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# Learn Ruby The Hard Way
# =======================
# This tool downloads an offline,
# single-page HTML version of the book.
# =======================
# NOTE: This was only done to test how well it performs.
# You should read the book on the site:
# http://ruby.learncodethehardway.org/book/
#
require 'cgi'
require 'uri'

require 'curb'
require 'nokogiri'
# Methods for fetching pages | |
# Downloads every given URL through a single Curl::Multi handle.
#
# Redirects are followed. Each distinct URL is requested only once, so a URL
# that appears several times in +urls+ cannot have its body appended twice.
#
# @param urls [Array<String>] URLs to download
# @return [Hash{String => String}] response body keyed by URL, in order of
#   first appearance; a URL for which no body data arrived maps to ""
def load_urls(urls)
  responses = {}
  multi = Curl::Multi.new

  urls.uniq.each do |url|
    # dup keeps the buffer appendable even if string literals are frozen.
    body = responses[url] = ''.dup
    handle = Curl::Easy.new(url) do |curl|
      curl.follow_location = true
      curl.on_body do |data|
        body << data
        # The block reports the size of the chunk it consumed back to curb.
        data.size
      end
    end
    multi.add(handle)
  end

  # Nothing queued means there is nothing to transfer.
  multi.perform unless responses.empty?
  responses
end
# Root url for the book
root_url = "http://ruby.learncodethehardway.org/book/"

# Fetch the main book index and parse it.
main_page = Nokogiri::HTML(load_urls([root_url])[root_url])

# Chapter links: one absolute URL per link found under a .toctree-l1 entry.
# Each href is resolved against root_url (so relative, root-relative and
# absolute hrefs all work), anchors without an href are ignored, and a chapter
# that is linked more than once is only listed once.
chapter_urls = main_page.css('.toctree-l1 a')
                        .map { |chapter_link| chapter_link['href'] }
                        .compact
                        .map { |href| URI.join(root_url, href).to_s }
                        .uniq
# Final page markup. This is a format template with two %s placeholders:
# the first receives the table-of-contents <li> items, the second receives
# the concatenated chapter markup. The doctype keeps browsers out of quirks
# mode when rendering the generated file.
final_markup = <<PAGE_TEMPLATE
<!DOCTYPE html>
<html>
<head>
<title>Learn Ruby The Hard Way</title>
</head>
<body>
<h1>Learn Ruby The Hard Way Book</h1>
<h2>Table of Contents</h2>
<ul>%s</ul>
%s
</body>
</html>
PAGE_TEMPLATE
# Fetch all the chapters
chapter_pages = load_urls(chapter_urls)

# Build the table of contents and the chapter bodies in a single pass, so each
# downloaded page is parsed only once and both parts use the same title.
toc_items = []
chapter_blocks = []
chapter_pages.each do |url, chapter_page|
  section = Nokogiri::HTML(chapter_page).at_css('.section')
  heading = section && section.at_css('h1')
  unless heading
    # An empty or unexpected download has no chapter markup to extract; skip
    # it instead of aborting the whole run on a nil.
    warn "Skipping #{url}: no .section with an <h1> found"
    next
  end

  # Text and attribute values come back unescaped from Nokogiri, so escape
  # them again before interpolating into markup.
  slug = CGI.escapeHTML(section['id'].to_s)
  # chop drops the last character of the heading text -- presumably a
  # permalink marker appended by the site generator; confirm against the
  # live markup.
  title = CGI.escapeHTML(heading.text.chop)

  # Detach the heading so it is not repeated inside the section body below.
  heading.remove

  toc_items << %(<li><a href="##{slug}">#{title}</a></li>\n)
  chapter_blocks << %(<a id="#{slug}"></a><div class="section"><h1>#{title}</h1><div>#{section.inner_html}</div></div>)
end

table_of_contents = toc_items.join
chapter_contents = chapter_blocks.join
# Fill the page template with the table of contents and the chapter markup,
# then save the result as rubythehardway.html in the current directory.
rendered_page = format(final_markup, table_of_contents, chapter_contents)
File.write('rubythehardway.html', rendered_page)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment