Created
June 3, 2011 01:29
-
-
Save richardlehane/1005698 to your computer and use it in GitHub Desktop.
This script generates a CSV that can be used by ProPublica's timeline-setter tool to make a nice timeline. It calls out to Wikipedia and Wragge's TROVE api to fill out the data provided by State Records NSW in the ministries.xml file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# This script generates a CSV that can be used by ProPublica's timeline-setter
# tool to make a nice timeline. It calls out to Wikipedia and Wragge's TROVE
# api to fill out the data provided by State Records NSW in the ministries.xml file.
#
#
require 'rubygems'
require 'nokogiri'
require 'net/http'
require 'date'

# Proxy settings -- you may not need these if you aren't accessing the web
# through a corporate firewall.
PROXY = 'MY_CORPORATE_FIREWALL'
PORT = 8080

# Month names used to build human-readable display dates (index 0 = January).
MONTHS = %w{January February March April May June July August September October November December}.freeze

# Base URL for linking each ministry back to its State Records NSW entry;
# the ministry number is appended to this.
LINK_ROOT = "http://investigator.records.nsw.gov.au/Entity.aspx?Path=\\Ministry\\".freeze
# Retrieve table elements from Wikipedia (to grab Labor and Liberal leader names).
#
# path  - the Wikipedia article path, appended to "/wiki/"
# query - an XPath predicate fragment applied to the table's header cells,
#         used to pick out the particular table we want
#
# Returns a Nokogiri::XML::NodeSet of matching <table> elements.
def get_wiki_table path, query
  wiki_string = Net::HTTP::Proxy(PROXY, PORT).start('en.wikipedia.org') {|http| http.get("/wiki/" + path).body}
  wiki = Nokogiri::HTML(wiki_string)
  # the xpath result is the method's return value; no need for a local
  wiki.xpath("//table[tr/th#{query}]")
end
# Wikipedia has a whole page devoted to NSW Labor leaders, they are pretty easy to scrape.
lab_table = get_wiki_table "Leader_of_the_Australian_Labor_Party_in_New_South_Wales", "='Party leader'"
# Surnames of every NSW Labor leader who served as premier.
LABOR_LEADERS = lab_table.xpath("tr[td]").collect do |row|
  full_name = row.xpath("td")[1].content
  surname = full_name.split[-1]
  premier = row.xpath("td")[4].content
  premier.empty? ? nil : surname # filter out any NSW Labor leaders that weren't premiers
end.compact # NB: must be compact, not compact! -- the bang form returns nil when there was nothing to remove
# ... for the Libs however we have to go to the Liberal Party's page and the scraping isn't as simple.
lib_table = get_wiki_table "Liberal_Party_of_Australia", "/a='New South Wales'"
# Walk the rows below the "New South Wales" heading until the next heading
# row, collecting leader surnames as we go.
row = lib_table.xpath("tr[th/a='New South Wales']")[0].next_element
LIBERAL_LEADERS = Array.new
until row.at("th")
  cell = row.xpath("td")[0]
  full_name = cell && cell.content
  # Nokogiri's #content always returns a String (never nil), so a bare
  # truthiness check on it is always true -- guard against missing cells and
  # blank names explicitly so we don't push nil surnames into the list
  unless full_name.nil? || full_name.strip.empty?
    LIBERAL_LEADERS << full_name.split[-1]
  end
  row = row.next_element
end
# For all the pre-1955 ministries, call out to Wragge's unofficial TROVE API
# for relevant newspaper articles.
#
# date    - an ISO-format date string ("YYYY-MM-DD"); articles are searched
#           from this date to the following day
# premier - the premier's surname, used as the exact search term
#
# Returns a String of HTML: one <p><a ...></a></p> per article, capped at five.
def get_articles date, premier
  premier = premier.downcase
  query = '/api/newspapers/articles/?state=nsw&format=xml&article_type=news'
  query += '&start_date=' + date
  query += '&end_date=' + (Date.parse(date) + 1).to_s
  query += '&exact=' + premier
  wragge_string = Net::HTTP::Proxy(PROXY, PORT).start('wraggelabs.appspot.com') {|http| http.get(query).body}
  wragge_xml = Nokogiri::XML(wragge_string)
  # collect [title, url] pairs straight from the result nodes
  articles = wragge_xml.root.xpath("results/resource").collect do |resource|
    [resource.at('title').content, resource.at('url').content]
  end
  html = String.new
  # cap at five articles per ministry; build with << to avoid reallocating
  # the string on every iteration
  articles.first(5).each do |title, url|
    html << '<p><a href="' + url + '">' + title + '</a></p>'
  end
  html
end
# Start constructing our CSV for timeline-setter. Columns are:
# date, display_date, description, link, (unused), series, html
output = "date,display_date,description,link,,series,html\n"
# Now go through each Ministry in SRNSW's data and add it to the CSV.
xml = Nokogiri::XML(File.open('ministries.xml'))
xml.root.search('Ministry').each do |ministry|
  puts "..working"
  # Start_date is an ISO datetime; we only want the leading YYYY-MM-DD part
  date = ministry.at('Start_date').content[0..9]
  # stick in the start date
  output << date + ','
  # stick in a display date, e.g. "3 June 2011"
  year_int = date[0..3].to_i
  month = MONTHS[date[5..6].to_i - 1]
  day = date[8..9].to_i.to_s
  output << day + ' ' + month + ' ' + year_int.to_s + ',,'
  description = ministry.at('Ministry_title').content
  output << LINK_ROOT + ministry.at('Ministry_number').content + ',,'
  # check if we can classify our Premier according to our Wikipedia lists
  premier = description.split(/\W/)[0]
  if LABOR_LEADERS.index(premier)
    output << 'labor,'
  elsif LIBERAL_LEADERS.index(premier)
    output << 'coalition,'
  else
    output << 'other,'
  end
  # Add a heading and grab TROVE articles if pre-1955. The whole html column
  # is one quoted CSV field, so any embedded double quotes must be doubled --
  # the articles HTML already does this below; the description must too or a
  # quote in a ministry title would break the CSV.
  output << '"<H1>' + description.gsub(/"/, '""') + '</H1>'
  if year_int < 1955
    articles = get_articles(date, premier)
    articles.gsub!(/"/, '""')
    output << articles
  end
  output << '"'
  output << "\n"
end
File.open("output.csv", 'w') {|file| file.write(output)}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment