Skip to content

Instantly share code, notes, and snippets.

@bouchard
Last active December 26, 2017 05:43
Show Gist options
  • Save bouchard/9d66b6a4fa20f7533bcc to your computer and use it in GitHub Desktop.
Save bouchard/9d66b6a4fa20f7533bcc to your computer and use it in GitHub Desktop.
Podcast XML/Builder Template for Middleman
# Render the podcast feed at /podcast.xml without wrapping it in a layout.
page '/podcast.xml', layout: false
# Methods defined in the helpers block are available in templates
helpers do
# Build the path of an article's audio file under source/audio/.
#
# article - object whose +data.file+ holds the audio file name
#           (presumably the `file` front-matter key; confirm against the articles).
#
# Returns the path as a String.
def podcast_source_path(article)
  audio_file = article.data.file
  'source/audio/' + audio_file.to_s
end
end
---
title: Podcast Number One
date: 2014-12-12 12:40 CST
file: one.mp3
---

Blah Blah.

# -*- coding: utf-8 -*-
require 'mp3info'
require 'nokogiri'
# Converts a string of HTML into a plain text approximation.
#
# The main entry point is +plain_text+. +truncate+ is a helper for shortening
# the resulting text to a maximum length.
class HtmlToPlainText
  # Tag-name lookup tables (tag name => true), used for fast membership checks.
  # IGNORE_TAGS are skipped entirely; PARAGRAPH_TAGS are surrounded by blank
  # lines; BLOCK_TAGS are surrounded by single line breaks.
  IGNORE_TAGS = %w(script style object applet iframe).inject({}){|h, t| h[t] = true; h}.freeze
  PARAGRAPH_TAGS = %w(p h1 h2 h3 h4 h5 h6 table ol ul dl dd blockquote dialog figure aside section).inject({}){|h, t| h[t] = true; h}.freeze
  BLOCK_TAGS = %w(div address li dt center del article header header footer nav pre legend tr).inject({}){|h, t| h[t] = true; h}.freeze
  WHITESPACE = [" ", "\n", "\r"].freeze
  PLAINTEXT = "plaintext".freeze
  PRE = "pre".freeze
  BR = "br".freeze
  HR = "hr".freeze
  TD = "td".freeze
  TH = "th".freeze
  TR = "tr".freeze
  OL = "ol".freeze
  UL = "ul".freeze
  LI = "li".freeze
  A = "a".freeze
  TABLE = "table".freeze
  # Starting labels for ordered lists; nesting levels alternate between them.
  NUMBERS = ["1", "a"].freeze
  ABSOLUTE_URL_PATTERN = /^[a-z]+:\/\/[a-z0-9]/i.freeze
  # Cheap test for "might contain markup or entities".
  HTML_PATTERN = /[<&]/.freeze
  TRAILING_WHITESPACE = /[ \t]+$/.freeze
  BODY_TAG_XPATH = "/html/body".freeze
  CARRIDGE_RETURN_PATTERN = /\r(\n?)/.freeze
  LINE_BREAK_PATTERN = /[\n\r]/.freeze
  # Captures everything after the scheme separator of a URL.
  NON_PROTOCOL_PATTERN = /:\/?\/?(.*)/.freeze
  NOT_WHITESPACE_PATTERN = /\S/.freeze
  SPACE = " ".freeze
  EMPTY = "".freeze
  NEWLINE = "\n".freeze
  HREF = "href".freeze
  TABLE_SEPARATOR = " | ".freeze

  class << self
    # Shorten +str+ so that, omission marker included, it is no longer than
    # +truncate_at+ characters.
    #
    # str         - String to shorten.
    # truncate_at - Integer maximum length of the result.
    # options     - Hash of options (never modified):
    #               :omission  - String appended to a shortened result (default '...').
    #               :separator - String or Regexp; when given, the cut is moved
    #                            back to the last match at or before the cut point.
    #
    # Returns a copy of +str+ when it already fits, otherwise the shortened
    # text followed by the omission. When +truncate_at+ is smaller than the
    # omission itself, the result is the omission alone.
    def truncate(str, truncate_at, options = {})
      return str.dup unless str.length > truncate_at

      # Read the default instead of writing it back: the caller's hash may be
      # shared or frozen.
      omission = options[:omission] || '...'
      separator = options[:separator]

      # Clamp at zero: a negative position would make both rindex and the
      # range slice below count from the END of the string.
      room = [truncate_at - omission.length, 0].max
      stop = (separator && str.rindex(separator, room)) || room

      "#{str[0...stop]}#{omission}"
    end

    # Convert some HTML into a plain text approximation.
    #
    # html - String of HTML, or nil.
    #
    # Returns nil for nil input, a copy of the input when it contains neither
    # '<' nor '&', nil when the parsed document has no body element, and the
    # converted text otherwise.
    def plain_text(html)
      return nil if html.nil?
      return html.dup unless html =~ HTML_PATTERN
      body = Nokogiri::HTML::Document.parse(html).xpath(BODY_TAG_XPATH).first
      return unless body
      convert_node_to_plain_text(body).strip.gsub(CARRIDGE_RETURN_PATTERN, NEWLINE)
    end

    private

    # Convert an HTML node to plain text. This method is called recursively with the output and
    # formatting options for special tags.
    #
    # parent  - node whose children are rendered.
    # out     - String buffer that is appended to in place.
    # options - Hash of formatting state built by +child_options+.
    #
    # Returns +out+.
    def convert_node_to_plain_text(parent, out = '', options = {})
      # Opening break for the element itself; the matching closing break is
      # added by the caller's loop once the element has been rendered.
      if PARAGRAPH_TAGS.include?(parent.name)
        append_paragraph_breaks(out)
      elsif BLOCK_TAGS.include?(parent.name)
        append_block_breaks(out)
      end

      format_list_item(out, options) if parent.name == LI
      out << "| " if parent.name == TR && data_table?(parent.parent)

      parent.children.each do |node|
        if node.text? || node.cdata?
          text = node.text
          unless options[:pre]
            # Outside <pre>, collapse line breaks and runs of spaces, and avoid
            # doubling whitespace that is already at the end of the buffer.
            text = node.text.gsub(LINE_BREAK_PATTERN, SPACE).squeeze(SPACE)
            text.lstrip! if WHITESPACE.include?(out[-1, 1])
          end
          out << text
        elsif node.name == PLAINTEXT
          out << node.text
        elsif node.element? && !IGNORE_TAGS.include?(node.name)
          convert_node_to_plain_text(node, out, child_options(node, options))

          if node.name == BR
            out.sub!(TRAILING_WHITESPACE, EMPTY)
            out << NEWLINE
          elsif node.name == HR
            out.sub!(TRAILING_WHITESPACE, EMPTY)
            out << NEWLINE unless out.end_with?(NEWLINE)
            out << "-------------------------------\n"
          elsif node.name == TD || node.name == TH
            out << (data_table?(parent.parent) ? TABLE_SEPARATOR : SPACE)
          elsif node.name == A
            href = node[HREF]
            # Append the target only for absolute URLs whose link text is
            # non-blank and is not simply the URL itself, with or without
            # its scheme prefix.
            if href &&
               href =~ ABSOLUTE_URL_PATTERN &&
               node.text =~ NOT_WHITESPACE_PATTERN &&
               node.text != href &&
               node.text != href[NON_PROTOCOL_PATTERN, 1]
              out << " (#{href}) "
            end
          elsif PARAGRAPH_TAGS.include?(node.name)
            append_paragraph_breaks(out)
          elsif BLOCK_TAGS.include?(node.name)
            append_block_breaks(out)
          end
        end
      end
      out
    end

    # Set formatting options that will be passed to child elements for a tag.
    #
    # Lists and <pre> get a fresh merged hash; every other tag receives the
    # very same hash object, which is what lets +format_list_item+ advance the
    # counter shared by sibling <li> elements.
    def child_options(node, options)
      if node.name == UL
        level = options[:ul] || -1
        level += 1
        options.merge(:list => :ul, :ul => level)
      elsif node.name == OL
        level = options[:ol] || -1
        level += 1
        options.merge(:list => :ol, :ol => level, :number => NUMBERS[level % 2])
      elsif node.name == PRE
        options.merge(:pre => true)
      else
        options
      end
    end

    # Add double line breaks between paragraph elements. If line breaks already exist,
    # new ones will only be added to get to two.
    def append_paragraph_breaks(out)
      out.sub!(TRAILING_WHITESPACE, EMPTY)
      if out.end_with?(NEWLINE)
        out << NEWLINE unless out.end_with?("\n\n")
      else
        out << "\n\n"
      end
    end

    # Add a single line break between block elements. If a line break already exists,
    # none will be added.
    def append_block_breaks(out)
      out.sub!(TRAILING_WHITESPACE, EMPTY)
      out << NEWLINE unless out.end_with?(NEWLINE)
    end

    # Add an appropriate bullet or number to a list element.
    #
    # Unordered items get one '*' per nesting level. Ordered items get the
    # current label, and options[:number] is advanced in place for the next
    # sibling.
    def format_list_item(out, options)
      if options[:list] == :ul
        out << "#{'*' * (options[:ul] + 1)} "
      elsif options[:list] == :ol
        number = options[:number]
        options[:number] = number.next
        out << "#{number}. "
      end
    end

    # True when the given node's `border` attribute parses to a positive integer.
    def data_table?(table)
      table.attributes['border'].to_s.to_i > 0
    end
  end
end
# RSS 2.0 podcast feed with iTunes extensions: one <channel> describing the
# blog and one <item> per blog article, each pointing at its MP3 enclosure.
xml.instruct!
xml.rss 'xmlns:itunes' => 'http://www.itunes.com/dtds/podcast-1.0.dtd', 'xmlns:atom' => 'http://www.w3.org/2005/Atom', :version => '2.0' do
  xml.channel do
    feed_url = URI.join(config[:blog_url], blog.options.prefix.to_s, 'podcast.xml')
    # Both channel dates are taken from the first article (presumably the
    # newest one; confirm the ordering of blog.articles).
    first_article = blog.articles.first

    xml.title config[:blog_title]
    xml.description config[:blog_description]
    xml.link feed_url
    xml.atom :link, 'rel' => 'self', 'href' => feed_url
    xml.language 'en-CA'
    # A blog without articles has no dates to report; omit the elements
    # instead of failing on nil.
    if first_article
      build_date = first_article.date.rfc2822
      xml.lastBuildDate build_date
      xml.pubDate build_date
    end
    xml.itunes :author, 'John Frank'
    xml.itunes :keywords, config[:blog_keywords].join(', ')
    xml.itunes :explicit, (config[:blog_clean] ? 'clean' : 'yes')
    xml.itunes :image, :href => URI.join(config[:blog_url], image_path('icon.png'))
    # plain_text may return nil; to_s keeps truncate from being called on nil.
    xml.itunes :summary, HtmlToPlainText.truncate(HtmlToPlainText.plain_text(config[:blog_description]).to_s, 1950)
    xml.itunes :owner do
      xml.itunes :name, 'John Frank'
      # NOTE(review): this address looks like e-mail-obfuscation residue from
      # the copy this file was taken from; restore the real owner address.
      xml.itunes :email, '[email protected]'
    end
    xml.itunes :category, :text => 'Science & Medicine' do
      xml.itunes :category, :text => 'Medicine'
    end

    blog.articles.each do |article|
      audio_path = podcast_source_path(article)
      article_url = URI.join(config[:blog_url], article.url)
      # Block form closes the MP3 file once the duration has been read;
      # Mp3Info.new would leave one file handle open per article.
      duration = Mp3Info.open(audio_path) { |mp3| mp3.length.to_i }

      xml.item do
        xml.title article.title
        xml.pubDate article.date.rfc2822
        xml.enclosure :url => tracked_url(podcast_url(article)), :length => File.size(audio_path), :type => 'audio/mpeg'
        xml.link article_url
        xml.guid({ :isPermaLink => true }, article_url)
        xml.itunes :author, 'John Frank'
        xml.itunes :summary do
          xml.cdata! HtmlToPlainText.truncate(HtmlToPlainText.plain_text(article.body).to_s, 3950)
        end
        xml.itunes :duration, duration
        # xml.description do
        # xml.cdata! article.body + partial(:audio_tag, :locals => { :article => article })
        # end
        # Most RSS readers will pull out the link to the enclosure, so no need to include it here.
        xml.description do
          xml.cdata! article.body
        end
      end
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment