title | date | file |
---|---|---|
Podcast Number One | 2014-12-12 12:40 CST | one.mp3 |
Blah Blah.
# Render the podcast feed page with no layout applied.
page '/podcast.xml', layout: false

# Methods defined in the helpers block are available in templates
helpers do
  # Build the path of the audio file that belongs to an article.
  #
  # article - an object responding to +data.file+; presumably a blog article
  #           whose front matter carries a +file+ entry (e.g. "one.mp3") —
  #           confirm against the article sources.
  #
  # Returns a String of the form "source/audio/<file>".
  def podcast_source_path(article)
    "source/audio/#{article.data.file}"
  end
end
title | date | file |
---|---|---|
Podcast Number One | 2014-12-12 12:40 CST | one.mp3 |
Blah Blah.
# -*- coding: utf-8 -*- | |
require 'mp3info' | |
require 'nokogiri' | |
# The main method on this class, +plain_text+, converts a string of HTML to a plain text
# approximation. +truncate+ is a small companion that shortens a string to a maximum length.
# Both are class-level methods; the remaining methods are private helpers used while walking
# the parsed HTML tree.
class HtmlToPlainText
  # Elements whose content is skipped entirely.
  IGNORE_TAGS = %w(script style object applet iframe).each_with_object({}) { |tag, set| set[tag] = true }.freeze
  # Elements that are separated from their surroundings by a blank line.
  PARAGRAPH_TAGS = %w(p h1 h2 h3 h4 h5 h6 table ol ul dl dd blockquote dialog figure aside section).each_with_object({}) { |tag, set| set[tag] = true }.freeze
  # Elements that are separated from their surroundings by a single line break.
  BLOCK_TAGS = %w(div address li dt center del article header footer nav pre legend tr).each_with_object({}) { |tag, set| set[tag] = true }.freeze
  WHITESPACE = [" ", "\n", "\r"].freeze
  PLAINTEXT = "plaintext".freeze
  PRE = "pre".freeze
  BR = "br".freeze
  HR = "hr".freeze
  TD = "td".freeze
  TH = "th".freeze
  TR = "tr".freeze
  OL = "ol".freeze
  UL = "ul".freeze
  LI = "li".freeze
  A = "a".freeze
  TABLE = "table".freeze
  # First label of an ordered list; nesting levels alternate between the two.
  NUMBERS = ["1", "a"].freeze
  ABSOLUTE_URL_PATTERN = /^[a-z]+:\/\/[a-z0-9]/i.freeze
  HTML_PATTERN = /[<&]/.freeze
  TRAILING_WHITESPACE = /[ \t]+$/.freeze
  BODY_TAG_XPATH = "/html/body".freeze
  CARRIDGE_RETURN_PATTERN = /\r(\n?)/.freeze
  LINE_BREAK_PATTERN = /[\n\r]/.freeze
  NON_PROTOCOL_PATTERN = /:\/?\/?(.*)/.freeze
  NOT_WHITESPACE_PATTERN = /\S/.freeze
  SPACE = " ".freeze
  EMPTY = "".freeze
  NEWLINE = "\n".freeze
  HREF = "href".freeze
  TABLE_SEPARATOR = " | ".freeze

  class << self
    # Shorten +str+ so that the result, including the omission marker, is at most
    # +truncate_at+ characters long.
    #
    # str         - the String to shorten; nil is passed through as nil.
    # truncate_at - maximum length of the returned string.
    # options     - :omission  marker appended to a shortened string (default '...').
    #               :separator when given, the cut is moved back to the last occurrence
    #                          of this String/Regexp that fits, if there is one.
    #
    # Returns a new String (a copy of +str+ when it already fits). The +options+ hash is
    # only read, never modified.
    def truncate(str, truncate_at, options = {})
      return nil if str.nil?
      return str.dup unless str.length > truncate_at

      omission = options[:omission] || '...'
      # Never let the cut position go negative: a negative index would count from the
      # end of the string and produce a result longer than requested.
      room = [truncate_at - omission.length, 0].max
      separator = options[:separator]
      stop = separator ? (str.rindex(separator, room) || room) : room

      "#{str[0...stop]}#{omission}"
    end

    # Convert some HTML into a plain text approximation.
    #
    # html - a String of HTML, or nil.
    #
    # Returns nil when +html+ is nil or when the parsed document has no body element,
    # a copy of +html+ when it contains neither "<" nor "&", and otherwise the text
    # rendering of the document body with carriage returns normalised to "\n".
    def plain_text(html)
      return nil if html.nil?
      return html.dup unless html =~ HTML_PATTERN

      body = Nokogiri::HTML::Document.parse(html).xpath(BODY_TAG_XPATH).first
      return unless body

      convert_node_to_plain_text(body).strip.gsub(CARRIDGE_RETURN_PATTERN, NEWLINE)
    end

    private

    # Convert an HTML node to plain text. This method is called recursively with the output and
    # formatting options for special tags.
    #
    # parent  - the node whose children are rendered.
    # out     - the String buffer that text is appended to (mutated in place).
    # options - formatting state for the current subtree (see +child_options+).
    #
    # Returns +out+.
    def convert_node_to_plain_text(parent, out = ''.dup, options = {})
      # Opening break for the element itself.
      if PARAGRAPH_TAGS.key?(parent.name)
        append_paragraph_breaks(out)
      elsif BLOCK_TAGS.key?(parent.name)
        append_block_breaks(out)
      end

      format_list_item(out, options) if parent.name == LI
      out << "| " if parent.name == TR && data_table?(parent.parent)

      parent.children.each do |node|
        if node.text? || node.cdata?
          text = node.text
          unless options[:pre]
            # Outside <pre>, line breaks and runs of spaces collapse to a single space.
            text = text.gsub(LINE_BREAK_PATTERN, SPACE).squeeze(SPACE)
            text.lstrip! if WHITESPACE.include?(out[-1, 1])
          end
          out << text
        elsif node.name == PLAINTEXT
          out << node.text
        elsif node.element? && !IGNORE_TAGS.key?(node.name)
          convert_node_to_plain_text(node, out, child_options(node, options))

          # Closing markup that follows the element's own content.
          if node.name == BR
            out.sub!(TRAILING_WHITESPACE, EMPTY)
            out << NEWLINE
          elsif node.name == HR
            out.sub!(TRAILING_WHITESPACE, EMPTY)
            out << NEWLINE unless out.end_with?(NEWLINE)
            out << "-------------------------------\n"
          elsif node.name == TD || node.name == TH
            out << (data_table?(parent.parent) ? TABLE_SEPARATOR : SPACE)
          elsif node.name == A
            href = node[HREF]
            # Append the target only for absolute URLs whose link text is non-blank and is
            # not simply the URL itself, with or without its protocol prefix.
            if href &&
               href =~ ABSOLUTE_URL_PATTERN &&
               node.text =~ NOT_WHITESPACE_PATTERN &&
               node.text != href &&
               node.text != href[NON_PROTOCOL_PATTERN, 1]
              out << " (#{href}) "
            end
          elsif PARAGRAPH_TAGS.key?(node.name)
            append_paragraph_breaks(out)
          elsif BLOCK_TAGS.key?(node.name)
            append_block_breaks(out)
          end
        end
      end

      out
    end

    # Set formatting options that will be passed to child elements for a tag.
    #
    # Lists get a fresh hash carrying their nesting level (and, for ordered lists, the
    # first label); <pre> gets a fresh hash with :pre set. Every other tag receives the
    # very same hash object, which is what lets +format_list_item+ advance the counter
    # across sibling list items.
    def child_options(node, options)
      if node.name == UL
        level = (options[:ul] || -1) + 1
        options.merge(:list => :ul, :ul => level)
      elsif node.name == OL
        level = (options[:ol] || -1) + 1
        options.merge(:list => :ol, :ol => level, :number => NUMBERS[level % 2])
      elsif node.name == PRE
        options.merge(:pre => true)
      else
        options
      end
    end

    # Add double line breaks between paragraph elements. If line breaks already exist,
    # new ones will only be added to get to two.
    def append_paragraph_breaks(out)
      out.sub!(TRAILING_WHITESPACE, EMPTY)
      if out.end_with?(NEWLINE)
        out << NEWLINE unless out.end_with?("\n\n")
      else
        out << "\n\n"
      end
    end

    # Add a single line break between block elements. If a line break already exists,
    # none will be added.
    def append_block_breaks(out)
      out.sub!(TRAILING_WHITESPACE, EMPTY)
      out << NEWLINE unless out.end_with?(NEWLINE)
    end

    # Add an appropriate bullet or number to a list element.
    #
    # Unordered items get one "*" per nesting level. Ordered items get the current label
    # followed by ". ", and the label stored in +options+ is advanced for the next item;
    # this in-place update of +options+ is intentional.
    def format_list_item(out, options)
      if options[:list] == :ul
        out << "#{'*' * (options[:ul] + 1)} "
      elsif options[:list] == :ol
        number = options[:number]
        options[:number] = number.next
        out << "#{number}. "
      end
    end

    # True when the given node has a +border+ attribute with a value greater than zero.
    def data_table?(table)
      table.attributes['border'].to_s.to_i > 0
    end
  end
end
# Builder template for the podcast RSS feed: one <channel> describing the show,
# followed by one <item> per blog article with its audio enclosure.
#
# Relies on names supplied from outside this template: +xml+, +config+, +blog+,
# +image_path+, +tracked_url+, +podcast_url+, +podcast_source_path+,
# +HtmlToPlainText+ and +Mp3Info+.
xml.instruct!

# Values used more than once are computed a single time.
feed_url = URI.join(config[:blog_url], blog.options.prefix.to_s, 'podcast.xml')
articles = blog.articles
# NOTE(review): presumably the first article is the most recent one — confirm the ordering.
latest_article = articles.first

xml.rss 'xmlns:itunes' => 'http://www.itunes.com/dtds/podcast-1.0.dtd', 'xmlns:atom' => 'http://www.w3.org/2005/Atom', :version => '2.0' do
  xml.channel do
    xml.title config[:blog_title]
    xml.description config[:blog_description]
    xml.link feed_url
    xml.atom :link, 'rel' => 'self', 'href' => feed_url
    xml.language 'en-CA'

    # A blog without any article has no date to report; skip both elements
    # instead of failing on nil.
    if latest_article
      latest_date = latest_article.date.rfc2822
      xml.lastBuildDate latest_date
      xml.pubDate latest_date
    end

    xml.itunes :author, 'John Frank'
    xml.itunes :keywords, config[:blog_keywords].join(', ')
    xml.itunes :explicit, (config[:blog_clean] ? 'clean' : 'yes')
    xml.itunes :image, :href => URI.join(config[:blog_url], image_path('icon.png'))
    # plain_text may return nil; to_s keeps truncate from being handed nil.
    xml.itunes :summary, HtmlToPlainText.truncate(HtmlToPlainText.plain_text(config[:blog_description]).to_s, 1950)
    xml.itunes :owner do
      xml.itunes :name, 'John Frank'
      xml.itunes :email, '[email protected]'
    end
    xml.itunes :category, :text => 'Science & Medicine' do
      xml.itunes :category, :text => 'Medicine'
    end

    articles.each do |article|
      article_url = URI.join(config[:blog_url], article.url)
      audio_path = podcast_source_path(article)

      xml.item do
        xml.title article.title
        xml.pubDate article.date.rfc2822
        # The enclosure length is the size of the audio file in bytes.
        xml.enclosure :url => tracked_url(podcast_url(article)), :length => File.size(audio_path), :type => 'audio/mpeg'
        xml.link article_url
        xml.guid({ :isPermaLink => true }, article_url)
        xml.itunes :author, 'John Frank'
        xml.itunes :summary do
          # plain_text may return nil; to_s keeps truncate from being handed nil.
          xml.cdata! HtmlToPlainText.truncate(HtmlToPlainText.plain_text(article.body).to_s, 3950)
        end
        xml.itunes :duration, Mp3Info.new(audio_path).length.to_i
        # xml.description do
        #   xml.cdata! article.body + partial(:audio_tag, :locals => { :article => article })
        # end
        # Most RSS readers will pull out the link to the enclosure, so no need to include it here.
        xml.description do
          xml.cdata! article.body
        end
      end
    end
  end
end