Ruby script to scrape termine.orf.at for FM4 music events and turn the result into an RSS feed, with one post per event, or a single digest post per run.
#!/usr/bin/env ruby

# Ruby script to scrape termine.orf.at for FM4 music events and
# turn the result into an RSS feed.
#
# Usage: ruby fm4-musik.rb <result-file> <data-file> [-digest]
#
# The result file is where the feed xml will be written to,
# and the data file stores some state necessary to track
# the events that were already posted.
# If the digest flag is set, all events found in one run
# will be written into one post, otherwise each event
# gets its own post.
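#
# Example invocations (the file names are placeholders):
#   ruby fm4-musik.rb feed.xml fm4-events.dump          # one post per event
#   ruby fm4-musik.rb feed.xml fm4-events.dump -digest  # one digest post for the whole run
#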
# MIT LICENSE:
#
# Copyright (c) 2013 Markus Pointner
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

require 'rubygems'
require 'open-uri'
require 'nokogiri'
require 'rss/maker'

# --- config ---

# start date of the events to query
STARTDATE = Time.new
# end date of the events to query
ENDDATE = STARTDATE + (60 * 60 * 24 * 60) # + 60 days
# event category on the site
CATEGORY = "79" # music
# base url, also used as channel url for the rss feed
BASE_SITE = "http://termine.orf.at/fm4/index.php"
# first url to query, will also be used to fetch links to more pages
SITE = "#{BASE_SITE}?action=searchform&qtext=&countryId=&categoryId=#{CATEGORY}" +
       "&startdate_date=#{STARTDATE.day}&startdate_month=#{STARTDATE.mon}&startdate_year=#{STARTDATE.year}" +
       "&enddate_date=#{ENDDATE.day}&enddate_month=#{ENDDATE.mon}&enddate_year=#{ENDDATE.year}"
# --- end of config ---

# load the given url as nokogiri document
def load_document(url)
  retries = 0
  while true
    # fetch the page
    body = open(url) { |io| io.read }
    if body[0..100].include? "SQL ERROR" # this happened sometimes
      if retries < 3
        retries += 1
        next
      else
        return nil
      end
    end
    # sadly, this site delivers poorly structured html, so we try to fix it up
    body.gsub!("</tr>\n <td", "</tr>\n<tr>\n <td")
    return Nokogiri::HTML(body)
  end
rescue
  nil
end

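# each event hash produced below looks roughly like this
# (values are illustrative, not taken from a real page):
#   { :link     => "http://www.example-venue.at/",
#     :date     => "Mo, 01.04.",
#     :title    => "Some Band",
#     :location => "Arena Wien",
#     :html     => "...cleaned-up event markup..." }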
# fetch a list of events from the given nokogiri document.
# page_link is the url of the scraped page, used as a fallback
# when an event has no link of its own.
def fetch_events(doc, page_link)
  # generate a hash for each event
  doc.css(".listEventMainCell").map do |event|
    title_tag = event.at(".eventTitle")
    title = title_tag && title_tag.inner_html
    location_tag = event.parent.parent.at(".eventLocation")
    location = location_tag && location_tag.inner_html
    date = event.at(".eventDate").inner_html
    # these links are quite tricky, there may not even be any link at all
    event_link_tag = event.search(".eventDescription a").last ||
                     event.search(".eventUrl a").last
    if event_link_tag
      event_link = event_link_tag["href"]
    elsif page_link and title
      # improvise a unique link
      event_link = page_link + "#" + title
    else
      next
    end
    # try to get an image
    image_tag = event.parent.parent.at(".eventThumbnail img")
    # throw out some garbage
    # remove the "link" image
    event.search("img").each do |img|
      img.swap "link" if img["src"] and img["src"].include? "link.gif"
    end
    # remove the little arrow before the date
    event.search(".arrow").remove
    # add breaks after title and description
    title_tag.after("<br>") if title_tag
    event.at(".eventDescription").after("<br>") if event.at(".eventDescription")
    {
      :link => event_link,
      :date => date,
      :title => title,
      :location => location,
      :html => location_tag.to_s + image_tag.to_s + event.inner_html
    }
  end
rescue StandardError
  []
end

# remove any items that were already posted, and remove posts which are too old
def filter_old_items(new_items, old_posts, time_limit)
  # filter out any items that were already posted
  new_items.reject! do |item|
    old_posts.any? do |post|
      post[:items].any? do |old_item|
        old_item[:date] == item[:date] && old_item[:title] == item[:title]
      end
    end
  end
  # drop whole posts (and the items they carry) that are older than the time limit
  old_posts.reject! { |post| post[:time] < time_limit }
  [new_items, old_posts]
end

# create the rss feed xml
def make_feed(posts, encoding)
  # add the new post to the feed
  feed = RSS::Maker.make("2.0") do |m|
    m.channel.title = "FM4 Termine - Musik"
    m.channel.link = BASE_SITE
    m.channel.description = "Konzerte von fm4.orf.at"
    m.items.do_sort = true # sort items by date
    posts.each do |p|
      post = m.items.new_item
      post.title = p[:title]
      post.link = p[:link]
      post.date = p[:time]
      post.description = p[:html]
    end
  end
  # fix the feed encoding (sub, not sub!, so we never return nil when nothing matches)
  feed.to_s.sub(/encoding="[^"]+"/, "encoding=\"#{encoding}\"")
end

# convert an item hash to a line of html for the feed post
def item_to_html(item)
  "<ul><a href=\"#{item[:link]}\">#{item[:date]} - #{item[:title]}</a> (#{item[:location]})</ul>"
end

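# for example (values are illustrative):
#   item_to_html(:link => "http://www.example-venue.at/", :date => "Mo, 01.04.",
#                :title => "Some Band", :location => "Arena Wien")
# returns:
#   <ul><a href="http://www.example-venue.at/">Mo, 01.04. - Some Band</a> (Arena Wien)</ul>
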
# convert a list of item hashes into a single feed post hash
def make_digest_post(items, postdate, postlink)
  # create the html for the new post
  html = items.map { |item| item_to_html(item) }.join("\n")
  # make a post hash
  {
    :title => "FM4 Termine - #{postdate.strftime("%a %b %d %Y")}",
    :link => postlink,
    :items => items,
    :time => postdate,
    :html => html
  }
end

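# a post hash is also what gets Marshal'ed into the data file; it looks
# roughly like this (values are illustrative):
#   { :title => "FM4 Termine - Mon Apr 01 2013", :link => SITE,
#     :items => [...event hashes...], :time => STARTDATE, :html => "<ul>...</ul>" }
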
# convert a list of item hashes into a list of event feed post hashes
def make_event_posts(items, postdate)
  items.map do |item|
    # use the same keys as for a digest post, to keep it compatible
    {
      :title => "#{item[:date]} - #{item[:title]}",
      :link => item[:link],
      :items => [item],
      :time => postdate,
      :html => item[:html]
    }
  end
end

# do main stuff
def main
  abort "Usage: #{File.basename($PROGRAM_NAME)} <result-file> <data-file> [-digest]" if ARGV.count < 2
  resultfile = ARGV.shift
  dumpfile = ARGV.shift
  is_digest = (ARGV.shift == "-digest")

  # fetch the first page, get the links to the other pages
  puts "loading \"#{SITE}\""
  doc = load_document(SITE)
  abort "failed to load \"#{SITE}\"" if doc.nil?

  # read the page charset from the meta tags, fall back to utf-8
  charset_meta = doc.at("head meta[content*='charset=']")
  charset = charset_meta ? charset_meta[:content][/charset=([\w-]+)/, 1] : "utf-8"

  links = doc.search("#listPageNavigTop a").map { |a| BASE_SITE + a["href"] }
  links.uniq!
  puts "found #{links.count} links"

  # we will use the first element as a dummy to process the first page,
  # which is already loaded in doc
  links.unshift nil

  # fetch all events
  items = links.map do |link|
    doc = load_document(link) if link
    fetch_events(doc, link || SITE) if doc
  end

  # clean up the item list
  items.flatten!
  items.compact!
  abort "no events found" if items.empty?
  puts "found #{items.count} events"

  # get data from previous scrapes
  posts = []
  if File.exist?(dumpfile)
    posts = Marshal.load(File.binread(dumpfile))
    puts "loaded #{posts.count} old posts"
  end

  # filter out any items that were already posted,
  # and remove posts that are too old
  datediff = ENDDATE - STARTDATE
  olddate = STARTDATE - datediff
  items, posts = filter_old_items(items, posts, olddate)
  puts "#{items.count} events after filtering"
  if items.empty?
    puts "no new events, exiting"
    exit 0
  end

  # add the new post(s), create the feed
  if is_digest
    posts << make_digest_post(items, STARTDATE, SITE)
  else
    posts += make_event_posts(items, STARTDATE)
  end
  puts "writing #{posts.count} posts"
  feedxml = make_feed(posts, charset)

  # write the feed to an xml file
  puts "writing feed file \"#{resultfile}\""
  File.open(resultfile, "w") do |io|
    io << feedxml
  end

  # store the scraped data for next time (binary mode, Marshal data is not text)
  File.open(dumpfile, "wb") do |io|
    io << Marshal.dump(posts)
  end
end

if __FILE__ == $0
  main
end