Created
August 12, 2011 17:49
-
-
Save basicxman/1142558 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env ruby | |
| # Script to convert the Learn Prolog Now! online HTML edition to a PDF. | |
| require 'nokogiri' | |
| require 'open-uri' | |
| require 'pdfkit' | |
# Crawls a book published as a set of linked HTML pages and concatenates the
# cleaned-up pages into a single HTML string (see #content).
class HTMLBook
  # Hrefs containing an http:// or https:// URL point outside the book and are
  # neither crawled nor rewritten.
  EXTERNAL_LINK = %r{https?://}

  # content - the HTML accumulated so far, one <div> per added page.
  # footer  - never assigned by this class, so it always returns nil; the
  #           reader is kept only so existing callers do not break.
  attr_reader :content, :footer

  # root - base URL of the book, without a trailing slash.
  def initialize(root)
    @root = root + "/"
    @content = ""
    @crawled_pages = []
  end

  # Returns the file name portion of +url+ with any "#fragment" removed,
  # e.g. "dir/page.html#sec" => "page.html".
  def parse_url(url)
    File.basename(url).gsub(/#.*$/, '')
  end

  # Adds +page+ itself, then every page linked from an <a> inside a
  # "ul li" element of +page+ (in the driver script this is the table of
  # contents).
  def add_and_crawl(page)
    doc = fetch_document(page)
    # #add removes elements and rewrites hrefs in the document it is given,
    # so hand it a copy and crawl the untouched original.
    add page, doc.dup
    doc.css("ul li").css("a").each do |link|
      href = link.attr("href")
      # Skip anchors without an href, links that leave the book, and pure
      # "#fragment" links, which have no page file name to fetch.
      next if href.nil? || href =~ EXTERNAL_LINK || parse_url(href).empty?
      add href
    end
  end

  # Appends +page+ to #content unless a page with the same base file name
  # was already added.
  #
  # page - path of the page relative to the root; may carry a "#fragment".
  # doc  - an already parsed document for +page+; fetched when omitted.
  #        It is modified in place.
  #
  # Returns the updated content, or nil when the page was skipped.
  def add(page, doc = nil)
    html_base_file = parse_url(page)
    return if @crawled_pages.include?(html_base_file)
    @crawled_pages << html_base_file

    doc ||= fetch_document(page)
    strip_page_chrome(doc)
    localize_links(doc)

    # Wrap the page in a div whose id is the page's file name.
    body_html = doc.css("body").inner_html
    @content += "<div id='#{html_base_file}'>#{body_html}</div>"
  end

  private

  # Downloads and parses +page+. URI#open (provided by open-uri) is used
  # instead of Kernel#open, which no longer accepts URLs on current Rubies;
  # the block form closes the downloaded stream once it has been parsed.
  def fetch_document(page)
    URI.parse(@root + page).open { |io| Nokogiri::HTML(io) }
  end

  # Removes per-page navigation and footer elements from +doc+.
  def strip_page_chrome(doc)
    # Remove next/up/prev links.
    doc.css("table.nav").each(&:remove)
    # Remove the <address> elements (the page footer).
    doc.css("address").each(&:remove)
    # Remove the extra horizontal rule, if the page has one at all.
    last_rule = doc.css("hr").last
    last_rule.remove if last_rule
  end

  # Rewrites links in +doc+ so they point inside the combined document.
  def localize_links(doc)
    doc.css("a").each do |link|
      href = link.attr("href")
      if href
        next if href =~ EXTERNAL_LINK
        # "page.html#sec" becomes "#sec"; an href without a fragment, such
        # as "page.html", becomes "#page.html".
        link["href"] = "#" + href.gsub(/^.*?#/, '')
      elsif link.attr("name")
        # Named anchors get an href pointing at their own name.
        link["href"] = "#" + link.attr("name")
      end
    end
  end
end
# Renders a book's accumulated HTML to a PDF file with PDFKit. The PDF is
# written as a side effect of constructing the object.
class Generator
  # File name used when the caller does not supply one.
  DEFAULT_OUTPUT = "learn_prolog_now.pdf".freeze

  # book   - any object whose #content returns the HTML string to render.
  # output - path of the PDF file to write (defaults to DEFAULT_OUTPUT).
  def initialize(book, output = DEFAULT_OUTPUT)
    PDFKit.new(book.content).to_file(output)
  end
end
# Crawl the book starting from its table of contents, then render the
# collected HTML to a PDF.
root_url = "http://cs.union.edu/~striegnk/learn-prolog-now/html"
prolog_book = HTMLBook.new(root_url)
prolog_book.add_and_crawl("toc.html")
Generator.new(prolog_book)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment