Ruby script to scrape termine.orf.at for FM4 music events and turn the result into an RSS feed, with one post per event, or a single digest post per run.
#!/usr/bin/env ruby

# Ruby script to scrape termine.orf.at for FM4 music events and
# turn the result into an RSS feed.
#
# Usage: ruby fm4-musik.rb <result-file> <data-file> [-digest]
#
# The result file is where the feed xml will be written to,
# and the data file stores some state necessary to track
# the events that were already posted.
# If the digest flag is set, all events found in one run
# will be written into one post, otherwise each event
# gets its own post.
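#
# Example invocations (the file names are placeholders):
#   ruby fm4-musik.rb feed.xml fm4-events.dump          # one post per event
#   ruby fm4-musik.rb feed.xml fm4-events.dump -digest  # one digest post for the whole run
#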
# MIT LICENSE:
#
# Copyright (c) 2013 Markus Pointner
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

require 'rubygems'
require 'open-uri'
require 'nokogiri'
require 'rss/maker'

# --- config ---

# start date of the events to query
STARTDATE = Time.new
# end date of the events to query
ENDDATE = STARTDATE + (60 * 60 * 24 * 60) # + 60 days
# event category on the site
CATEGORY = "79" # music
# base url, also used as channel url for the rss feed
BASE_SITE = "http://termine.orf.at/fm4/index.php"
# first url to query, will also be used to fetch links to more pages
SITE = "#{BASE_SITE}?action=searchform&qtext=&countryId=&categoryId=#{CATEGORY}" +
       "&startdate_date=#{STARTDATE.day}&startdate_month=#{STARTDATE.mon}&startdate_year=#{STARTDATE.year}" +
       "&enddate_date=#{ENDDATE.day}&enddate_month=#{ENDDATE.mon}&enddate_year=#{ENDDATE.year}"
# --- end of config ---

# load the given url as nokogiri document
def load_document(url)
  retries = 0
  while true
    # fetch the page
    body = open(url) { |io| io.read }
    if body[0..100].include? "SQL ERROR" # this happened sometimes
      if retries < 3
        retries += 1
        next
      else
        return nil
      end
    end
    # sadly, this site delivers poorly structured html, so we try to fix it up
    body.gsub!("</tr>\n <td", "</tr>\n<tr>\n <td")
    return Nokogiri::HTML(body)
  end
rescue
  nil
end

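# each event hash produced below looks roughly like this
# (values are illustrative, not taken from a real page):
#   { :link     => "http://www.example-venue.at/",
#     :date     => "Mo, 01.04.",
#     :title    => "Some Band",
#     :location => "Arena Wien",
#     :html     => "...cleaned-up event markup..." }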
# fetch a list of events from the given nokogiri document.
# page_link is the url of the scraped page, used as a fallback
# when an event has no link of its own.
def fetch_events(doc, page_link)
  # generate a hash for each event
  doc.css(".listEventMainCell").map do |event|
    title_tag = event.at(".eventTitle")
    title = title_tag && title_tag.inner_html
    location_tag = event.parent.parent.at(".eventLocation")
    location = location_tag && location_tag.inner_html
    date = event.at(".eventDate").inner_html
    # these links are quite tricky, there may not even be any link at all
    event_link_tag = event.search(".eventDescription a").last ||
                     event.search(".eventUrl a").last
    if event_link_tag
      event_link = event_link_tag["href"]
    elsif page_link and title
      # improvise a unique link
      event_link = page_link + "#" + title
    else
      next
    end
    # try to get an image
    image_tag = event.parent.parent.at(".eventThumbnail img")
    # throw out some garbage
    # remove the "link" image
    event.search("img").each do |img|
      img.swap "link" if img["src"] and img["src"].include? "link.gif"
    end
    # remove the little arrow before the date
    event.search(".arrow").remove
    # add breaks after title and description
    title_tag.after("<br>") if title_tag
    event.at(".eventDescription").after("<br>") if event.at(".eventDescription")
    {
      :link => event_link,
      :date => date,
      :title => title,
      :location => location,
      :html => location_tag.to_s + image_tag.to_s + event.inner_html
    }
  end
rescue StandardError
  []
end

# remove any items that were already posted, and remove posts which are too old
def filter_old_items(new_items, old_posts, time_limit)
  # filter out any items that were already posted
  new_items.reject! do |item|
    old_posts.any? do |post|
      post[:items].any? do |old_item|
        old_item[:date] == item[:date] && old_item[:title] == item[:title]
      end
    end
  end
  # drop whole posts (and the items they carry) that are older than the time limit
  old_posts.reject! { |post| post[:time] < time_limit }
  [new_items, old_posts]
end

# create the rss feed xml
def make_feed(posts, encoding)
  # add the new post to the feed
  feed = RSS::Maker.make("2.0") do |m|
    m.channel.title = "FM4 Termine - Musik"
    m.channel.link = BASE_SITE
    m.channel.description = "Konzerte von fm4.orf.at"
    m.items.do_sort = true # sort items by date
    posts.each do |p|
      post = m.items.new_item
      post.title = p[:title]
      post.link = p[:link]
      post.date = p[:time]
      post.description = p[:html]
    end
  end
  # fix the feed encoding (sub, not sub!, so we never return nil when nothing matches)
  feed.to_s.sub(/encoding="[^"]+"/, "encoding=\"#{encoding}\"")
end

# convert an item hash to a line of html for the feed post
def item_to_html(item)
  "<ul><a href=\"#{item[:link]}\">#{item[:date]} - #{item[:title]}</a> (#{item[:location]})</ul>"
end

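# for example (values are illustrative):
#   item_to_html(:link => "http://www.example-venue.at/", :date => "Mo, 01.04.",
#                :title => "Some Band", :location => "Arena Wien")
# returns:
#   <ul><a href="http://www.example-venue.at/">Mo, 01.04. - Some Band</a> (Arena Wien)</ul>
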
# convert a list of item hashes into a single feed post hash
def make_digest_post(items, postdate, postlink)
  # create the html for the new post
  html = items.map { |item| item_to_html(item) }.join("\n")
  # make a post hash
  {
    :title => "FM4 Termine - #{postdate.strftime("%a %b %d %Y")}",
    :link => postlink,
    :items => items,
    :time => postdate,
    :html => html
  }
end

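# a post hash is also what gets Marshal'ed into the data file; it looks
# roughly like this (values are illustrative):
#   { :title => "FM4 Termine - Mon Apr 01 2013", :link => SITE,
#     :items => [...event hashes...], :time => STARTDATE, :html => "<ul>...</ul>" }
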
# convert a list of item hashes into a list of event feed post hashes
def make_event_posts(items, postdate)
  items.map do |item|
    # use the same keys as for a digest post, to keep it compatible
    {
      :title => "#{item[:date]} - #{item[:title]}",
      :link => item[:link],
      :items => [item],
      :time => postdate,
      :html => item[:html]
    }
  end
end

# do main stuff
def main
  abort "Usage: #{File.basename($PROGRAM_NAME)} <result-file> <data-file> [-digest]" if ARGV.count < 2
  resultfile = ARGV.shift
  dumpfile = ARGV.shift
  is_digest = (ARGV.shift == "-digest")

  # fetch the first page, get the links to the other pages
  puts "loading \"#{SITE}\""
  doc = load_document(SITE)
  abort "failed to load \"#{SITE}\"" if doc.nil?

  # read the page charset from the meta tags, fall back to utf-8
  charset_meta = doc.at("head meta[content*='charset=']")
  charset = charset_meta ? charset_meta[:content][/charset=([\w-]+)/, 1] : "utf-8"

  links = doc.search("#listPageNavigTop a").map { |a| BASE_SITE + a["href"] }
  links.uniq!
  puts "found #{links.count} links"

  # we will use the first element as a dummy to process the first page,
  # which is already loaded in doc
  links.unshift nil

  # fetch all events
  items = links.map do |link|
    doc = load_document(link) if link
    fetch_events(doc, link || SITE) if doc
  end

  # clean up the item list
  items.flatten!
  items.compact!
  abort "no events found" if items.empty?
  puts "found #{items.count} events"

  # get data from previous scrapes
  posts = []
  if File.exist?(dumpfile)
    posts = Marshal.load(File.binread(dumpfile))
    puts "loaded #{posts.count} old posts"
  end

  # filter out any items that were already posted,
  # and remove posts that are too old
  datediff = ENDDATE - STARTDATE
  olddate = STARTDATE - datediff
  items, posts = filter_old_items(items, posts, olddate)
  puts "#{items.count} events after filtering"
  if items.empty?
    puts "no new events, exiting"
    exit 0
  end

  # add the new post(s), create the feed
  if is_digest
    posts << make_digest_post(items, STARTDATE, SITE)
  else
    posts += make_event_posts(items, STARTDATE)
  end
  puts "writing #{posts.count} posts"
  feedxml = make_feed(posts, charset)

  # write the feed to an xml file
  puts "writing feed file \"#{resultfile}\""
  File.open(resultfile, "w") do |io|
    io << feedxml
  end

  # store the scraped data for next time (binary mode, Marshal data is not text)
  File.open(dumpfile, "wb") do |io|
    io << Marshal.dump(posts)
  end
end

if __FILE__ == $0
  main
end