Last active
August 29, 2015 14:04
-
-
Save tuttinator/52268cffeec09665d852 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'date'
require 'yaml'

require 'capybara'
require 'capybara/dsl'
require 'capybara/mechanize'
module Parliament | |
# One "Question for Oral Answer" entry from the listing.
#
# A plain value holder: every field starts out nil and is assigned by
# whoever builds the record.
class Question
  # Date of the Question Time session.
  attr_accessor :date
  # Question number within the session.
  attr_accessor :number
  # Topic of the question.
  attr_accessor :topic
  # MP asking the question.
  attr_accessor :mp
  # Minister the question was asked of.
  attr_accessor :minister
  # Link to the question's own page.
  attr_accessor :url
end
# Scrapes the "Questions for Oral Answer" listing on www.parliament.nz.
#
# Questions for Oral Answer - data structure (one Question per listing row):
#   Date
#   Question Number
#   Topic
#   MP asking the question
#   Minister asked
#   URL
class Scraper
  include Capybara::DSL

  QUESTIONS_PER_PAGE = 20

  # Rows whose heading matches this are just lists of the primary questions,
  # not questions themselves.
  LIST_ENTRY = /List of questions for oral answer/

  # Message printed when a row has no recognisable "<MP> to <Minister>" text.
  NO_MINISTER_NOTICE = "-- Possibly a question to the select committee chairperson"

  # Configures Capybara (process-wide settings) to drive the remote site
  # through the mechanize driver, without booting a local Rack server.
  def initialize
    Capybara.current_driver = :mechanize
    Capybara.run_server = false
    Capybara.app_host = "http://www.parliament.nz"
  end

  # Visits the first listing page and reads the total number of questions
  # from its summary heading.
  #
  # Assumes that it'll find this:
  #   <td class='summary'><h3>Questions for oral answer 1 to 20 of 9721</h3></td>
  #
  # Side effect: stores the total in @total_questions.
  #
  # Returns the zero-based index of the last listing page (never below 0).
  # Raises RuntimeError when the heading does not contain "of <digits>".
  def find_number_of_pages
    visit listing_path(0)

    summary = page.find("td.summary h3").text
    count = summary[/of (\d+)/, 1]
    raise "Could not read the question total from #{summary.inspect}" unless count

    @total_questions = count.to_i # should be a number like 9721

    # Pages are zero indexed. Subtracting one before the integer division
    # keeps an exact multiple of QUESTIONS_PER_PAGE from yielding an extra,
    # empty trailing page.
    [(@total_questions - 1) / QUESTIONS_PER_PAGE, 0].max
  end

  # Walks every listing page and builds a Question for each real question row.
  #
  # Prints each question as YAML as it is found, then the whole collection
  # as YAML at the end.
  #
  # Returns the Array of Question objects collected.
  def scrape
    questions = []

    (0..find_number_of_pages).each do |n|
      visit listing_path(n)

      page.all("table.listing tbody tr").each do |tr|
        question = build_question(tr)
        next unless question

        puts question.to_yaml
        questions << question
      end
    end

    puts questions.to_yaml
    questions
  end

  private

  # Path of one listing page. The listing covers every Question for Oral
  # Answer since 2003, in groups of 20 questions per page.
  def listing_path(page_number)
    "/en-NZ/PB/Business/QOA/Default.htm?p=#{page_number}&sort=PublicationDate&order=0"
  end

  # Builds a Question from one table row, or returns nil for the
  # "List of questions for oral answer" rows.
  def build_question(row)
    anchor = row.find("h4 a") # looked up once; supplies both heading and link
    heading = anchor.text
    return nil if heading =~ LIST_ENTRY

    question = Question.new

    # Parse the date of the Question Time session
    question.date = Date.parse(row.find("td.attrPublicationDate").text)

    # Heading looks like "<number>. <topic>". Limit the split to two parts
    # so a topic that itself contains a period is kept whole.
    number, topic = heading.split('.', 2)
    question.number = strip_or_nil(number)
    question.topic = strip_or_nil(topic)

    # get the link (probably the most valuable thing)
    question.url = anchor[:href]

    question.mp, question.minister = asker_and_minister(row)
    question
  end

  # Splits the row's "<MP> to <Minister>" paragraph on the first ' to '.
  #
  # Returns [mp, minister]; either may be nil when the text is absent or
  # does not contain ' to ', in which case a notice is printed instead of
  # raising.
  def asker_and_minister(row)
    mp, minister = row.find("td p").text.split(' to ', 2).map(&:strip)
    puts NO_MINISTER_NOTICE unless minister
    [mp, minister]
  rescue Capybara::ElementNotFound
    puts NO_MINISTER_NOTICE
    [nil, nil]
  end

  # Strips surrounding whitespace, passing nil through untouched.
  def strip_or_nil(value)
    value && value.strip
  end
end
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment