Last active
August 29, 2015 13:57
-
-
Save meqif/9722515 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
#
# Downloads the entire 'Worm' story and builds an ebook out of it in HTML form,
# ready to be processed by Calibre into whatever format you need.
#
require 'nokogiri' | |
require 'open-uri' | |
# Table-of-contents page that links to every chapter of the story.
# Frozen so the shared constant cannot be mutated in place by accident.
INDEX_URL = "http://parahumans.wordpress.com/table-of-contents/".freeze
# Fetch the table of contents and return the chapter links.
#
# Downloads INDEX_URL, looks inside the ".entry-content" element, drops the
# ".sharedaddy" sharing widgets and collects the remaining <a> elements.
#
# @return [Array] link elements that have non-blank text and an href; each
#   one has its text trimmed and its href given a scheme if it lacked one
def fetch_index
  # URI#open (mixed in by open-uri) only ever performs a URL fetch; Kernel#open
  # stopped accepting URLs in Ruby 3.0. The block form closes the handle.
  doc = URI.parse(INDEX_URL).open { |io| Nokogiri::HTML(io) }

  entry = doc.css(".entry-content")
  # Remove sharing links
  entry.search(".sharedaddy").remove
  anchors = entry.css("a")

  # Flatten markup inside the links so only their text is left
  anchors.search("br").remove
  anchors.search("strong").each { |node| node.replace(node.content) }

  # Skip anchors without visible text or without a target
  chapters = anchors.reject do |a|
    a.text.strip.empty? || a['href'].to_s.strip.empty?
  end

  chapters.each do |a|
    # `a.text` returns a fresh String on every call, so the trimmed value has
    # to be written back to the node for the clean-up to have any effect.
    a.content = a.text.strip

    # Some urls lack the scheme; anchor the check so https links are
    # recognised instead of being prefixed a second time.
    href = a['href'].strip
    href = "http://#{href}" unless href =~ %r{\Ahttps?://}i
    a['href'] = href
  end

  chapters
end
# Fetch a single page/chapter and return its cleaned-up body.
#
# @param url [String] absolute URL of the chapter page
# @return the ".entry-content" node set of the page, with sharing widgets,
#   "Last Chapter"/"Next Chapter" links and blank paragraphs removed
def fetch_chapter(url)
  # The URL comes from scraped markup, so go through URI#open (open-uri)
  # rather than Kernel#open: the latter treats a leading "|" as a command to
  # run and no longer fetches URLs on Ruby 3.0+. The block form closes the
  # handle once the document is parsed.
  doc = URI.parse(url).open { |io| Nokogiri::HTML(io) }

  # Get content
  content = doc.css(".entry-content")

  # Remove sharing links
  content.search(".sharedaddy").remove

  # Remove "Last Chapter" and "Next Chapter" links. This runs before the
  # paragraph pass so paragraphs that held nothing else are dropped as well.
  content.search("a").each do |link|
    link.remove if link.text =~ /(Next|Last) Chapter/
  end

  # Remove empty paragraphs; non-breaking spaces count as blank
  content.search("p").each do |par|
    par.remove if par.text.gsub(/\u00A0/, '').strip.empty?
  end

  content
end
# Percent-encode every byte of +url+ that is neither a URI reserved nor an
# unreserved character. Stands in for URI.encode, which was removed in
# Ruby 3.0, and uses the same "safe" character set.
def escape_url(url)
  url.gsub(%r{[^\-_.!~*'()a-zA-Z\d;/?:@&=+$,\[\]]}) do |char|
    char.each_byte.map { |byte| format('%%%02X', byte) }.join
  end
end

# Escape the characters that are significant in HTML text and attributes.
def escape_html(text)
  text.gsub(/[&<>"]/, '&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;')
end

# Build complete HTML
#
# Writes "output.html" in the current directory: a fixed page header, a table
# of contents with one entry per link returned by fetch_index, then one
# <section> per chapter holding the content returned by fetch_chapter.
def build
  File.open("output.html", "w") do |f|
    template = <<-END
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Worm</title>
</head>
<body>
<h1>Worm</h1>
    END
    f.write(template)

    chapters = fetch_index

    # Output index
    f.write("<section class=\"toc\">\n")
    f.write("<ul>\n")
    chapters.each_with_index do |link, position|
      # Titles are scraped text, so escape them before embedding in markup
      f.write("<li><a href=\"#chapter_#{position}\">#{escape_html(link.text)}</a></li>\n")
    end
    f.write("</ul>\n</section>\n")

    # Output chapters; the section ids match the anchors written above
    chapters.each_with_index do |link, position|
      title = escape_html(link.text)
      chapter = fetch_chapter(escape_url(link['href']))
      f.write("\n<section id=\"chapter_#{position}\" class=\"chapter\">\n<h1>#{title}</h1>#{chapter}\n</section>")
    end

    f.write("\n</body>\n</html>")
  end
end
# Main: build the ebook only when this file is run directly as a script,
# not when it is loaded from another file.
build if __FILE__ == $PROGRAM_NAME
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment