Created
May 3, 2016 16:09
-
-
Save robmckinnon/08f6dead0f094773e57b66dc4cdc6097 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'date'
require 'fileutils'
require 'json'
require 'open-uri'

require 'mechanize'
require 'nokogiri'
require 'pry'
# Scrapes UTIAC decision pages from tribunalsdecisions.service.gov.uk, stores
# each decision's metadata as a JSON file and submits it through the
# "New document" form of a locally running publishing application.
module UtiacScraper
  extend self

  # Host that serves both the listing pages and the individual decisions.
  SOURCE_HOST = 'https://tribunalsdecisions.service.gov.uk'
  # Index page of the local application that exposes the "New document" link.
  PUBLISHER_URL = 'http://localhost:3064/utiac-decisions'
  # Directory (relative to the working directory) that receives the JSON files.
  JSON_DIR = 'json'

  # Scrapes and publishes every decision linked from the given listing pages.
  #
  # @param pages [Enumerable<Integer>] listing page numbers; defaults to the
  #   first page only
  # @return [Enumerable<Integer>] the pages that were processed
  def scrape_and_publish(pages = 1..1)
    pages.each { |page| scrape_page(page) }
  end

  # Fetches one listing page and processes every decision it links to.
  #
  # @param page [Integer] listing page number
  # @return [Array<String>] the decision links found on the page
  def scrape_page(page)
    puts page
    # URI.open is required on Ruby 3.0+, where Kernel#open no longer
    # accepts URLs; the block form closes the handle after reading.
    html = URI.open("#{SOURCE_HOST}/utiac?page=#{page}", &:read)
    doc = Nokogiri::HTML(html)
    # compact drops anchors without an href, which would otherwise raise.
    items = doc.search('a').map { |a| a['href'] }.compact.grep(%r{utiac/}).uniq
    download(items)
  end

  # Downloads each decision page, publishes its metadata and writes the
  # metadata to "<JSON_DIR>/<last path segment>.json".
  #
  # @param items [Array<String>] decision paths, appended to SOURCE_HOST
  # @return [Array<String>] the items that were passed in
  def download(items)
    # The output directory may not exist on a fresh checkout.
    FileUtils.mkdir_p(JSON_DIR)
    items.each do |link|
      puts link
      html = URI.open(SOURCE_HOST + link, &:read)
      details = metadata(Nokogiri::HTML(html))
      publish(details)
      filename = "#{File.basename(link)}.json"
      puts filename
      File.write(File.join(JSON_DIR, filename), details.to_json)
    end
  end

  # Extracts the title, the labelled decision details and the download links
  # from a parsed decision page.
  #
  # @param doc [Nokogiri::HTML::Document] parsed decision page
  # @return [Hash] 'Title' plus one entry per detail label (trailing ':'
  #   removed) and one entry per download link, keyed by the link's class
  def metadata(doc)
    details = { 'Title' => doc.at('h1').text }

    list = doc.at('ul.decision-details')
    fields = list ? list.search('li') : []
    fields.each do |field|
      label = field.at('.label')
      content = field.at('span[2]')
      # Skip list items that do not have both a label and a value span.
      next unless label && content

      details[label.text.chomp(':')] = content.text.strip
    end

    links = doc.at('.download-links')
    attachments = links ? links.search('a') : []
    attachments.each { |a| details[a['class']] = a['href'] }

    details
  end

  # Fills in and submits the "New document" form of the local publisher.
  #
  # @param data [Hash] metadata as returned by #metadata
  # @return [Object] whatever Mechanize returns from submitting the form
  # @raise [RuntimeError] when the form has no judges multi-select field
  def publish(data)
    agent = Mechanize.new
    index = agent.get(PUBLISHER_URL)
    form = index.link_with(text: 'New document').click.form

    form['utiac_decision[title]'] = data['Title']
    form['utiac_decision[summary]'] = data['Case title']
    form['utiac_decision[body]'] = 'N/A'
    form['utiac_decision[promulgation_date]'] = date(data['Promulgation date'])
    form['utiac_decision[country_guidance]'] = 'Not Country Guidance'
    form['utiac_decision[country]'] = country(data['Country'])
    form['utiac_decision[decision_reported]'] = reported(data['Status of case'])

    # instance_of? keeps the exact-class match of the original comparison;
    # is_a? would also accept subclasses of MultiSelectList.
    judges_field = form.fields.find do |field|
      field.instance_of?(Mechanize::Form::MultiSelectList) &&
        field.name.to_s.match?(/judges/)
    end
    raise 'no judges multi-select field found on the new document form' unless judges_field

    judges_field.value = judges(data['Judges'])
    form.submit
  end

  # Normalises a human-readable date to ISO 8601 (YYYY-MM-DD).
  #
  # @param value [String, nil] date text
  # @return [String, nil] ISO date, or nil when the value is blank
  # @raise [ArgumentError] when non-blank text cannot be parsed as a date
  def date(value)
    text = value.to_s.strip
    return if text.empty?

    Date.parse(text).to_s
  end

  # Maps a scraped country value to the value submitted to the publisher.
  #
  # @param value [String, nil] scraped country
  # @return [String] 'No Country' for blank or 'Not applicable', otherwise
  #   the stripped country name
  def country(value)
    name = value.to_s.strip
    return 'No Country' if name.empty? || name == 'Not applicable'

    name
  end

  # Maps a scraped case status to the value submitted to the publisher.
  #
  # @param value [String, nil] scraped status
  # @return [String] 'Reported' only when the whole value is "reported"
  #   (case-insensitive); 'Not Reported' otherwise
  def reported(value)
    # \A and \z anchor the whole string; ^ and $ would match any single line.
    value.to_s.strip.match?(/\Areported\z/i) ? 'Reported' : 'Not Reported'
  end

  # Splits a comma-separated list of judge names.
  #
  # @param value [String, nil] comma-separated names
  # @return [Array<String>] stripped, non-empty names; empty for blank input
  def judges(value)
    value.to_s.split(',').map(&:strip).reject(&:empty?)
  end
end
# Run the scraper only when this file is executed directly, so that requiring
# it from another script does not start network requests.
UtiacScraper.scrape_and_publish if $PROGRAM_NAME == __FILE__
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment