Last active
October 28, 2021 10:57
-
-
Save PatrickLerner/5874015 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# encoding: utf-8 | |
# Version: 0.2a 2013-06-28 | |
require 'fileutils'
require 'open-uri'
require 'rbconfig'
require 'tmpdir'

require 'nokogiri'
require 'trollop'
# Truthy (the match offset) when the host OS name reported by RbConfig looks
# like a Windows flavour, nil otherwise.
host_os = RbConfig::CONFIG['host_os']
$is_windows = (host_os =~ /mswin|mingw|cygwin/)
# Normalises full-width characters in scraped markup to their ASCII forms.
#
# Full-width digits (U+FF10..U+FF19) are mapped to '0'..'9'. Afterwards the
# first full-width "h" directly followed by 2, 3 or 4 is replaced by an ASCII
# "h" so that heading tag names become h2/h3/h4.
#
# NOTE(review): the literals in this method had been flattened to plain ASCII
# (tr('0-9', '0-9') and sub('h2', 'h2')), which made the whole method a no-op.
# The full-width forms were restored on the assumption that this was the
# original intent -- confirm against the upstream script.
#
# @param str [String] markup or text to normalise
# @return [String] a new string; +str+ itself is not modified
def clean_string(str)
  # The digits are converted first, so the heading patterns below only need
  # to look for a full-width "h" followed by an ASCII digit.
  str.tr('０-９', '0-9').sub('ｈ2', 'h2').sub('ｈ3', 'h3').sub('ｈ4', 'h4')
end
# Unwraps every descendant element called +element_name+ below +node+: each
# match is replaced by its own inner HTML, so the tag disappears while its
# children stay in place. Mutates +node+.
#
# @param node [#search] node to search below (XPath ".//<element_name>")
# @param element_name [String] tag name to unwrap
# @return the collection of matches returned by +node.search+
def strip_element_tags(node, element_name)
  matches = node.search('.//' + element_name)
  matches.each { |element| element.replace(element.inner_html) }
end
# Removes furigana markup below +node+: every <rt> element is deleted
# outright, then the surrounding <ruby> wrappers are unwrapped so only their
# remaining content is left. Mutates +node+.
#
# @param node [#search] node to clean
# @return whatever strip_element_tags returns for the 'ruby' pass
def strip_ruby_tags(node)
  readings = node.search('.//rt')
  readings.remove
  strip_element_tags node, 'ruby'
end
# Base class for a scraped news article.
#
# Subclasses are expected to set @doc (the parsed document) and the three
# XPath expressions @XPath_title, @XPath_time and @XPath_article.
#
# Every getter works on a copy of the first matching node. The tag-stripping
# helpers mutate the node they are given, and callers such as HTMLOutput ask
# for the title twice with different options; without the copy, the first
# call (:ruby => false) would permanently remove the furigana from @doc.
class Article
  # Returns the article title.
  #
  # @param options [Hash] :ruby  => keep furigana (<ruby>/<rt>) markup
  #                       :clean => return plain text content instead of markup
  # @return [String] the title, or '' when the title XPath matches nothing
  def get_title(options = {})
    node = first_node(@XPath_title)
    return '' if node.nil?

    strip_ruby_tags(node) unless options[:ruby]
    return node.content.to_s if options[:clean]

    clean_string(node.to_s)
  end

  # Returns the markup of the date element with any <span> wrappers removed.
  #
  # @param options [Hash] accepted for symmetry with the other getters; unused
  # @return [String] the date markup, or '' when the date XPath matches nothing
  def get_date(options = {})
    node = first_node(@XPath_time)
    return '' if node.nil?

    strip_element_tags(node, 'span')
    clean_string(node.to_s)
  end

  # Returns the inner HTML of the article body with <span> and <a> wrappers
  # removed.
  #
  # @param options [Hash] :ruby => keep furigana (<ruby>/<rt>) markup
  # @return [String] the body markup, or '' when the XPath matches nothing
  def get_content(options = {:ruby => false})
    node = first_node(@XPath_article)
    return '' if node.nil?

    strip_ruby_tags(node) unless options[:ruby]
    strip_element_tags(node, 'span')
    strip_element_tags(node, 'a')
    clean_string(node.inner_html.to_s)
  end

  private

  # Returns a copy of the first node matching +xpath+, or nil if none matches.
  def first_node(xpath)
    node = @doc.xpath(xpath).first
    node && node.dup
  end
end
# Article whose title, date and body are located through the element ids
# "newstitle", "newsDate" and "newsarticle".
class NHKEasyArticle < Article
  # Downloads and parses the page at +url+.
  #
  # @param url [String] address of the article page
  def initialize(url)
    # Kernel#open stopped handling URLs in Ruby 3.0; URI.open is the open-uri
    # entry point where it exists. Fall back to Kernel#open on older Rubies
    # where URI.open is not a public method.
    opener = URI.respond_to?(:open) ? URI.method(:open) : method(:open)
    # Block form so the downloaded stream is closed once it has been parsed.
    @doc = opener.call(url) { |io| Nokogiri::HTML(io) }
    @XPath_title = '//*[@id="newstitle"]/h2'
    @XPath_time = '//*[@id="newsDate"]'
    @XPath_article = '//*[@id="newsarticle"]'
  end
end
# Article located through positional XPaths below the element with id "news".
# The getters rewrite the scraped markup so that the title becomes an <h2>
# and the date a <p id="newsDate">.
class NHKArticle < Article
  # Downloads and parses the page at +url+.
  #
  # @param url [String] address of the article page
  def initialize(url)
    # Kernel#open stopped handling URLs in Ruby 3.0; URI.open is the open-uri
    # entry point where it exists. Fall back to Kernel#open on older Rubies
    # where URI.open is not a public method.
    opener = URI.respond_to?(:open) ? URI.method(:open) : method(:open)
    # Block form so the downloaded stream is closed once it has been parsed.
    @doc = opener.call(url) { |io| Nokogiri::HTML(io) }
    @XPath_title = '//*[@id="news"]/div[2]/div/div/div[1]/h1/span'
    @XPath_time = '//*[@id="news"]/div[2]/div/div/div[1]/h1/div'
    @XPath_article = '//*[@id="news"]/div[2]/div/div/div'
  end

  # Title from the base class with every occurrence of "span" replaced by
  # "h2" (this turns the <span> wrapper into an <h2>).
  #
  # @return [String]
  def get_title(options = {})
    super.gsub('span', 'h2')
  end

  # Date from the base class with the <div class="time"> wrapper rewritten to
  # <p id="newsDate">[ ... ]</p>.
  #
  # @return [String]
  def get_date(options = {})
    super.gsub('<div class="time">', '<p id="newsDate">[').gsub('</div>', ']</p>')
  end

  # Concatenates the cleaned inner HTML of the article sections, stopping at
  # the first section whose id is "news_mkanren", then drops everything that
  # precedes the <p id="news_textbody"> opening tag.
  #
  # @param options [Hash] :ruby => keep furigana (<ruby>/<rt>) markup
  # @return [String]
  def get_content(options = {:ruby => false})
    sections = @doc.xpath(@XPath_article).take_while do |section|
      section.attribute('id').to_s != 'news_mkanren'
    end

    html = sections.map do |section|
      # Work on a copy: the strip helpers mutate their argument, and the
      # parsed document is read again by later getter calls.
      copy = section.dup
      strip_ruby_tags(copy) unless options[:ruby]
      strip_element_tags(copy, 'span')
      strip_element_tags(copy, 'a')
      clean_string(copy.inner_html.to_s)
    end.join

    html.sub(/.*<p id="news_textbody">/m, '<p id="news_textbody">')
  end
end
# Writes an article as "<fileName>.html" plus a matching "<fileName>.css".
# All work happens in the constructor.
class HTMLOutput
  # Base stylesheet, used as-is for the horizontal layout.
  HORIZONTAL_CSS = <<EOS
body {
font-family: serif; }
h2, h3 {
font-weight: bold;
padding-top: 2em;
margin-right: 1em;
margin-left: 1em; }
h2 {
font-size: 120%; }
p {
text-indent: 1em; }
#newsDate {
font-size: 90%;
font-weight:bold;
line-height: 1.5; }
EOS

  # Rules appended to HORIZONTAL_CSS for the (default) vertical layout.
  VERTICAL_CSS = <<EOS
body {
-webkit-writing-mode: vertical-rl; }
#newsDate {
padding-top: 10em;
text-indent: -4em; }
EOS

  # Document head; {{TITLE}} and {{CSS_FILE}} are filled in per article.
  HTML_HEADER = <<EOS
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xml:lang="ja" xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta http-equiv="Content-Style-Type" content="text/css" />
<meta name="generator" content="pandoc" />
<title>{{TITLE}}</title>
<link rel="stylesheet" href="{{CSS_FILE}}" type="text/css" />
<link rel="Schema.DC" href="http://purl.org/dc/elements/1.1/" />
<meta name="DC.Title" content="{{TITLE}}" />
<meta name="DC.Creator" content="NHK" />
<meta name="DC.Publisher" content="NHK" /></head>
<body>
EOS

  HTML_FOOTER = <<EOS
</body>
</html>
EOS

  # @param article [#get_title, #get_date, #get_content] article to render
  # @param fileName [String] output path without extension
  # @param options [Hash] :horizontal => write the horizontal stylesheet
  #   instead of the vertical one; the whole hash is also forwarded to the
  #   article getters (e.g. :ruby)
  def initialize(article, fileName, options = {})
    title = article.get_title(:ruby => false, :clean => true)
    css_path = fileName + '.css'
    # The stylesheet is written next to the HTML file, so the link must be the
    # bare file name; repeating the directory part of fileName would resolve
    # relative to the HTML file and point at the wrong place.
    css_href = File.basename(css_path)

    @horizontal_css = HORIZONTAL_CSS
    @vertical_css = HORIZONTAL_CSS + VERTICAL_CSS
    # Block form of gsub: the replacement is inserted literally, so a
    # backslash sequence in the title is not treated as a back-reference.
    # NOTE(review): the title is inserted without HTML escaping -- confirm
    # whether titles can contain "<" or "&".
    @html_header = HTML_HEADER.gsub('{{TITLE}}') { title }.gsub('{{CSS_FILE}}') { css_href }
    @html_footer = HTML_FOOTER

    File.open(css_path, 'w') do |file|
      file.write(options[:horizontal] ? @horizontal_css : @vertical_css)
    end

    File.open(fileName + '.html', 'w') do |file|
      file.write(@html_header)
      file.write(article.get_title(options))
      file.write(article.get_date(options))
      file.write(article.get_content(options))
      file.write(@html_footer)
    end
  end
end
# Converts an article to "<fileName>.mobi" by rendering HTML, CSS and an OPF
# package file into a temporary directory and running kindlegen on the OPF.
# All work happens in the constructor.
class KindleOutput
  # OPF package description; {{TITLE}}, {{CSS_FILE}} and {{FILENAME}} are
  # filled in per article. Must start at column 0: nothing may precede the
  # XML declaration.
  OPF_TEMPLATE = <<EOS
<?xml version="1.0" encoding="UTF-8"?>
<package version="3.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookId">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/">
<dc:title>{{TITLE}}</dc:title>
<dc:contributor>NHK</dc:contributor>
<dc:language>ja</dc:language>
<dc:publisher>NHK</dc:publisher>
</metadata>
<manifest>
<item id="style" href="{{CSS_FILE}}" media-type="text/css" />
<item id="titlepage" href="{{FILENAME}}.html" media-type="application/xhtml+xml" />
</manifest>
<spine toc="tocncx" page-progression-direction="rtl">
<itemref idref="titlepage" />
</spine>
</package>
EOS

  # @param article [#get_title, #get_date, #get_content] article to convert
  # @param fileName [String] output path without extension; the result is
  #   written to fileName + ".mobi"
  # @param options [Hash] forwarded to HTMLOutput (:ruby, :horizontal)
  # @raise [RuntimeError] if the kindlegen executable cannot be started
  def initialize(article, fileName, options = {})
    title = article.get_title(:ruby => false, :clean => true)
    # Intermediate files live side by side in the temp dir, so they are named
    # and referenced by the bare file name even when fileName has a directory
    # part (which would not exist below the temp dir).
    base_name = File.basename(fileName)

    # Block form of gsub inserts the replacement literally (no back-reference
    # interpretation of backslashes).
    # NOTE(review): the title is inserted without XML escaping -- confirm
    # whether titles can contain "<" or "&".
    opf = OPF_TEMPLATE.gsub('{{TITLE}}') { title }
    opf = opf.gsub('{{FILENAME}}') { base_name }
    @opf_file = opf.gsub('{{CSS_FILE}}') { base_name + '.css' }

    Dir.mktmpdir do |dir|
      work_base = File.join(dir, base_name)
      HTMLOutput.new(article, work_base, options)
      File.open(work_base + '.opf', 'w') { |file| file.write(@opf_file) }

      kindlegen = $is_windows ? 'kindlegen.exe' : 'kindlegen'
      # Argument-list form: no shell is involved, so quotes or "$(...)" in a
      # file name derived from a scraped title cannot alter the command.
      started = system(kindlegen, work_base + '.opf')
      # Only a failed launch (nil) is treated as fatal; the exit status is
      # deliberately not checked.
      raise "could not run #{kindlegen}; is it in PATH?" if started.nil?

      FileUtils.cp work_base + '.mobi', fileName + '.mobi'
    end
  end
end
# main part
#
# Parses the command line, picks the article backend whose URL pattern
# matches --url, converts the article to a .mobi file and, when --open is
# given on a non-Windows host, opens the result.
opts = Trollop::options do
  version "JapNewsToKindle 0.2a (c) 2013 Patrick Lerner [[email protected]]"
  banner <<-EOS
This program dumps Japanese News websites into a kindle compatible mobi file using Amazon's kindlegen (needs to be in path!).
Usage:
JapNewsToKindle [options]
where [options] are:
EOS
  opt :ruby, "Get furigana if possible", :short => 'r'
  opt :url, "The URL that is supposed to be dumped", :type => String, :short => 'u'
  opt :out, "The output filename", :type => String, :short => 'O'
  opt :horizontal, "Use a horizontal layout instead of the default vertical one", :default => false, :short => 'n'
  opt :open, "Open the generated file in the Kindle Application", :default => false, :short => 'o'
end

# URL pattern => article class. Dots in the host name are escaped so they
# only match a literal ".".
backends = [
  [/nhk\.or\.jp\/news\/easy\/k[0-9]+\/k[0-9]+\.html/, NHKEasyArticle],
  [/nhk\.or\.jp\/news\/html\/[0-9]+\/[a-z][0-9]+\.html/, NHKArticle]
]

# Regexp#match returns nil for a nil argument, so a missing --url also ends
# up at the die call below.
_pattern, backend = backends.find { |pattern, _klass| pattern.match(opts[:url]) }
Trollop::die :url, "must match against a backend supported by this program" if backend.nil?

article = backend.new(opts[:url])
file_name = opts[:out] || article.get_title(:ruby => false, :clean => true)
KindleOutput.new(article, file_name, {:ruby => opts[:ruby], :horizontal => opts[:horizontal]})

if opts[:open] && !$is_windows
  system 'killall', 'Kindle'
  # Remove a previously imported copy from the Kindle application's content
  # folder before opening the new file.
  kindle_file_path = ENV['HOME'] + "/Library/Application Support/Kindle/My Kindle Content/#{file_name}.mobi"
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  FileUtils.rm kindle_file_path if File.exist?(kindle_file_path)
  # Argument-list form: the file name (possibly a scraped title) is never
  # interpreted by a shell.
  system 'open', "#{file_name}.mobi"
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi,
Could you please provide instructions to use this script? I am not familiar with Ruby so I don't know how to set it up. I installed the nokogiri gem but when I run:
ruby JapNewsToKindle
I get these errors: