Created
August 27, 2010 07:16
-
-
Save rhulse/552971 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This code is used to 'line parse' schedules that are created in MS Word.
# It builds on the gist http://gist.github.com/552955
# Once the HTML is cleaned up in core, it is passed into a class
# based on the type of document.
# Each class works through a schedule document line-by-line, determining
# the context - what is the day and event.
# These events are stored and imported into the main CMS.
# The checksum routine provides an internal check to ensure that
# the data collected and converted into objects is accurate.
# I consider this code to be a 'glorious' hack. It is stable most
# of the time, parsing documents with a set structure.
# It is finely tuned for our specific purpose, thus if the document does
# not match the expected structure, it fails.
# The code is provided in the hope that someone may find the techniques developed useful.
# Earlier versions of this code (3) were written in PHP and have been in weekly use
# since at least 2005.
# This is the third generation of Ruby based code.
# At the end are some classes that we used to re-parse schedules from the live
# site so we could insert this information into our new Rails-based CMS.
# Richard Hulse. 27 August 2010
# Copyright (c) Radio New Zealand Limited 2010
# MIT license
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
require 'time'

require 'html_parser_core'
# Base class for the line-by-line schedule parsers.
#
# parse walks the tidied HTML one line at a time, tracking which day and
# which event the current line belongs to, and builds a hash of events keyed
# by the event's start time in epoch seconds. Each event is itself a hash
# with the keys :body, :time, :day and :title. checksum_is_ok? then
# cross-checks the captured day/time strings against those computed keys.
#
# Subclasses adapt the parser to a document type by changing the regexps set
# in initialize and by overriding the hook methods (pre_tidy_cleanup,
# post_tidy_cleanup, pre_test_cleanup, is_content_to_skip).
#
# NOTE(review): tidy_html, strip_tags, strip_time and @error_messages are
# not defined in this class; presumably ParserCore supplies them - confirm.
class ScheduleParser < ParserCore
  def initialize
    # Regexps for detecting the current state and context.
    @line_has_date_regexp = /((saturday|sunday|monday|tuesday|wednesday|thursday|friday)(?:,?)\s+\d{1,2}(\w{2})? \w{3,9}( \d{4})?)/i
    @line_is_event_heading_regexp = /<p><b>/i
    # Regexp source for the separator between hours and minutes in a heading.
    @time_delimiter = '\.'
    # Count of day headings seen (used as a checksum against the expected count).
    @day_count = 0
    @expected_day_count = 7
    # Events for the current parse, keyed by event id (epoch seconds).
    @week = {}
    # The HTML currently being worked on.
    @html = ''
    super
  end

  # Hook: called before the HTML is tidied. Derived classes override this
  # to do document-specific clean-up on @html.
  def pre_tidy_cleanup
  end

  # Hook: called after the HTML is tidied. Derived classes override this
  # to do document-specific clean-up on @html.
  def post_tidy_cleanup
  end

  # Hook: called to rewrite a line, if it needs it, before the line is
  # checked for context. The default returns the line unchanged.
  def pre_test_cleanup(line)
    line
  end

  # Hook: return a truthy value for line content that should be left out of
  # the parse. The default returns nil, so nothing is skipped.
  def is_content_to_skip(line)
  end

  # The main parsing method.
  #
  # dirty_html - the raw HTML of a schedule document (the argument itself is
  #              cloned, not modified).
  #
  # Returns the hash of events keyed by epoch seconds.
  #
  # A line that is neither a date heading nor an event heading is appended to
  # the body of the current event. Until the first event heading has been
  # seen there is no current event, so such a line raises NoMethodError; this
  # fail-fast behaviour on unexpected document structure is kept on purpose.
  def parse(dirty_html)
    @html = dirty_html.clone
    @week = {}
    # Start the day checksum afresh so a parser instance can be reused.
    @day_count = 0

    pre_tidy_cleanup
    @html = tidy_html(@html)
    post_tidy_cleanup

    current_date_id = ''
    current_event_id = ''
    current_date_string = ''
    index = 0
    last_hour = 0

    @html.each_line do |line|
      next if is_content_to_skip(line)

      line = pre_test_cleanup(line)

      case line
      when @line_has_date_regexp
        # The date title for a day.
        @day_count += 1
        index = 0      # reset the event index for the start of a new day
        last_hour = 0  # new day
        current_date_string = $1
        date = Time.parse(current_date_string)
        # A numerical version of the date to base the day's events on.
        current_date_id = date.to_i
      when @line_is_event_heading_regexp
        # Headings carry a 12-hour clock time with no am/pm marker; a heading
        # without a recognisable time yields 0 for both hour and minute.
        line =~ /((\d{1,2})#{@time_delimiter}(\d{2}))/
        this_hour = $2.to_i
        this_minute = $3.to_i
        this_time = "%02d.%02d" % [this_hour, this_minute] # '%I.%M' format for the checksum

        # A 12.xx time on the first event of the day is just after midnight.
        this_hour = 0 if index == 0 && this_hour == 12
        index += 1

        # The hour going backwards means we have passed midday.
        this_hour += 12 if last_hour > this_hour
        last_hour = this_hour

        # Midnight of the current day plus the time of the event.
        current_event_id = current_date_id + (this_hour * 60 * 60) + (this_minute * 60)

        # Hour 24 is midnight, which belongs to the next day, so the check
        # data has to be adjusted to match.
        if this_hour == 24
          # Midnight is 12 am in 12-hour clock time.
          this_time = "00.00"
          # Replace the captured day string with the real (next) day.
          current_date_string = Time.at(current_event_id).strftime('%A %e %B %Y').gsub(/\s+/, ' ')
        end

        @week[current_event_id] = {
          :body  => '',
          :time  => this_time,
          :day   => current_date_string,
          :title => strip_time(strip_tags(line))
        }
      else
        description = format_description(line)
        @week[current_event_id][:body] += description
      end
    end

    @week
  end

  # Checks that the date and time strings captured from the document match
  # the values recomputed from each event's epoch key, and that the number
  # of day headings seen equals the expected count. Every mismatch is
  # appended to @error_messages.
  #
  # Returns true when @error_messages is empty afterwards, false otherwise.
  def checksum_is_ok?
    @week.sort.each do |index, event|
      event_time = Time.at(index)
      was_time = event_time.strftime('%I.%M').strip
      am_pm = event_time.strftime('%p')
      # Midnight is a special case: parse records it as 00.00.
      was_time = '00.00' if was_time == '12.00' && am_pm == 'AM'
      # %e pads single-digit days with a space, so squeeze the whitespace.
      was_date = event_time.strftime('%A %e %B %Y').gsub(/\s+/, ' ')

      if event[:time] != was_time
        @error_messages << "time parsing error on #{event[:title]}"
        @error_messages << " captured => #{event[:time]} on #{event[:day]}"
        @error_messages << " calculated => #{was_time} on #{was_date}"
      end

      if event[:day] != was_date
        @error_messages << "day parsing error on #{event[:title]} => #{event[:day]}"
      end
    end

    if @day_count != @expected_day_count
      @error_messages << "the number of days is #{@day_count} when it should be #{@expected_day_count}"
    end

    @error_messages.empty?
  end

  # Returns the list of error messages collected so far.
  def errors
    @error_messages
  end
end
# Line parser for the National programme listing documents.
# Uses the regexps and day count set up by ScheduleParser unchanged and only
# overrides the clean-up and skip hooks; initialize and parse are inherited.
class NationalScheduleParser < ScheduleParser
  # Lines matching any of these are left out of the parse.
  SKIP_PATTERNS = [
    /RADIO NEW ZEALAND NATIONAL Programme Listing/
  ].freeze

  # Rewrites <strong> tags as <b> before the HTML is tidied.
  #
  # NOTE(review): the original also substituted <em> with <em> and </em>
  # with </em>, which changes nothing, so those calls were dropped. If a
  # different mapping (for example <i> to <em>) was intended, add it here.
  def pre_tidy_cleanup
    @html.gsub!(/<strong>/, '<b>')
    @html.gsub!(/<\/strong>/, '</b>')
  end

  # Document-specific fixes applied to the tidied HTML.
  def post_tidy_cleanup
    email_regex = /Email:\s+((\w|\-|\_|\.)+\@((\w|\-|\_)+\.)+[a-zA-Z]{2,})/i
    # Remove email addresses.
    @html.gsub!(email_regex, '')
    # Fix the All Night Programme title: wrap it in <b>.
    @html.gsub!(/<p>(12\.\d{2}) All Night Programme/, '<p><b>\1 All Night Programme</b>')
    # Drop empty paragraphs.
    @html.gsub!(/<p><\/p>/, '')
    # Misc - fix butted times by putting a space before am/pm.
    @html.gsub!(/(\d)am /, '\1 am ')
    @html.gsub!(/(\d)pm /, '\1 pm ')
  end

  # Returns true when the line matches one of SKIP_PATTERNS, false otherwise.
  def is_content_to_skip(line)
    SKIP_PATTERNS.any? { |pattern| pattern === line }
  end
end
# Line parser for National schedule pages (the "Live" variant).
# Compared with ScheduleParser it expects a single day per document, uses
# <h4> for event headings and ':' between hours and minutes. parse is
# inherited unchanged.
class NationalScheduleLiveParser < ScheduleParser
  # Lines matching any of these are left out of the parse.
  SKIP_PATTERNS = [
    /RADIO NEW ZEALAND NATIONAL Programme Listing/,
    /If you wish to adapt our programme schedules/,
    /Programme Schedules are licensed under the Creative Commons/,
    /Please identify us as author/,
    /<h2>Programme Schedule<\/h2>/,
    /<h3>/
  ].freeze

  def initialize
    super
    @line_is_event_heading_regexp = /<h4>/i
    @expected_day_count = 1
    @time_delimiter = ':'
  end

  # Removes the space that follows an h4 tag before the HTML is tidied.
  def pre_tidy_cleanup
    @html.gsub!(/h4> /, 'h4>')
  end

  # Document-specific fixes applied to the tidied HTML.
  def post_tidy_cleanup
    # Fix the All Night Programme title: turn the paragraph into an h4 heading.
    @html.gsub!(/<p>(\d{1,2}:\d{2}) All Night Programme/, '<h4>\1 All Night Programme')
    # Fix up some old style schedules: when the document has a
    # "Programme Schedule" h2, its h3 tags are rewritten as h2, which keeps
    # those lines clear of the <h3> entry in SKIP_PATTERNS.
    if @html =~ /<h2>Programme Schedule<\/h2>/
      @html.gsub!(/<h3>/, '<h2>')
      @html.gsub!(/<\/h3>/, '</h2>')
    end
  end

  # Returns true when the line matches one of SKIP_PATTERNS, false otherwise.
  def is_content_to_skip(line)
    SKIP_PATTERNS.any? { |pattern| pattern === line }
  end
end
# Line parser for Concert schedule documents.
# Differs from ScheduleParser in the event heading style, one clean-up rule
# and the lines it skips.
class ConcertScheduleParser < ScheduleParser
  def initialize
    super
    # A stricter date regexp that requires an h2 before the date is kept
    # below, disabled. Its purpose: item text such as
    # "recorded on Saturday 14 May 2010" also looks like a date, which makes
    # the parser think we are on a new day.
    # @line_has_date_regexp = /h2.*((saturday|sunday|monday|tuesday|wednesday|thursday|friday) \d{1,2}(\w{2})? \w{3,9}( \d{4})?)/i
    # Concert uses a different heading style.
    @line_is_event_heading_regexp = /<h3>/i
  end

  # Prefixes a bold "Disc 1" paragraph with the time 12.00.
  def post_tidy_cleanup
    @html.gsub!(/<p><b>Disc 1/, '<p><b>12.00 - Disc 1')
  end

  # Returns true for lines that should be left out of the parse,
  # false for everything else.
  def is_content_to_skip(line)
    skip_patterns = [/News & Weather:/i, /New Zealand Music Week/i]
    skip_patterns.any? { |pattern| pattern === line }
  end

  # Delegates straight to ScheduleParser#parse.
  def parse(html)
    super
  end
end
# Line parser for Concert schedule pages (the "Live" variant).
# Compared with ScheduleParser it expects a single day per document, only
# accepts dates that follow an <h2, uses <h4> for event headings and ':'
# between hours and minutes. parse is inherited unchanged.
class ConcertScheduleLiveParser < ScheduleParser
  # Lines matching any of these are left out of the parse.
  SKIP_PATTERNS = [
    /News & Weather: /,
    /If you wish to adapt our programme schedules/,
    /Programme Schedules are licensed under the Creative Commons/,
    /Please identify us as author/,
    /Waitangi Day/,
    /<h3>/,
    /New Zealand Music Week/i
  ].freeze

  def initialize
    super
    # The date regexp must have an h2 at the start to separate real day
    # headings from items that have text such as
    # "recorded on Saturday 14 May 2010", which would otherwise make the
    # parser think we are on a new day.
    @line_has_date_regexp = /<h2.*((saturday|sunday|monday|tuesday|wednesday|thursday|friday)\s+\d{1,2}(\w{2})? \w{3,9}( \d{4})?)/i
    # Concert uses a different heading style.
    @line_is_event_heading_regexp = /<h4>/i
    @expected_day_count = 1
    @time_delimiter = ':'
  end

  # Rewrites a line before it is checked for context:
  # - a line containing "approx " is stripped of tags, has every "approx"
  #   wrapped in brackets and is re-wrapped as a plain paragraph;
  # - a line containing "Disc 1" is replaced by a fixed "12.00 - Disc 1"
  #   paragraph;
  # - any other line is returned unchanged.
  def pre_test_cleanup(line)
    if line =~ /approx /
      "<p>#{strip_tags(line).gsub(/approx/, '(approx)')}</p>\n"
    elsif line =~ /Disc 1/
      "<p>12.00 - Disc 1</p>\n"
    else
      line
    end
  end

  # Returns true when the line matches one of SKIP_PATTERNS, false otherwise.
  def is_content_to_skip(line)
    SKIP_PATTERNS.any? { |pattern| pattern === line }
  end
end
# A description is a non heading line that describes the programme or its contents.
#
# Formats one description line:
# - removes <h2> opening tags, self-closing <br/> tags and " RR" markers;
# - rewrites times such as "9.30 " or "9:30 " as "<strong>9:30</strong> ";
# - when process_brackets is true, wraps bracketed text in <em>.
#
# description      - the line to format. It is not modified; the work is done
#                    on a copy, so frozen strings are accepted too.
# process_brackets - set to false to leave bracketed text untouched.
#
# Returns a new string with leading and trailing whitespace stripped.
def format_description(description, process_brackets=true)
  formatted = description.dup
  formatted.gsub!(/<h2>/, '')
  formatted.gsub!(/<br[^>]*?\/>/, '')
  formatted.gsub!(/ RR/, '')
  formatted.gsub!(/((\d{1,2})(\.|:)(\d{2})) /, '<strong>\2:\4</strong> ')
  formatted.gsub!(/\(([\w|\s|\.|\,]+)\)/, '<em>(\1)</em>') if process_brackets
  formatted.strip
end
# Looks for a closing o:smarttagtype tag in the given HTML.
# Returns the index of the first such tag, or nil when there is none.
def check_for_smarttags(html)
  smarttag_close = /<\/o:smarttagtype>/
  html =~ smarttag_close
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment