# GitHub Gist by @robmckinnon, created May 3, 2016 16:09.
# Gist: robmckinnon/08f6dead0f094773e57b66dc4cdc6097
require 'open-uri'
require 'nokogiri'
require 'json'
require 'pry'
require 'date'
require 'mechanize'
# Scrapes UTIAC decision pages from tribunalsdecisions.service.gov.uk, writes
# each decision's metadata to a JSON file, and submits it through the
# "New document" form of a locally running publishing application.
module UtiacScraper
  extend self

  # Root of the public decisions site; listing hrefs are resolved against it.
  SOURCE_URL = 'https://tribunalsdecisions.service.gov.uk'.freeze
  # Index page of the local application that receives the scraped decisions.
  PUBLISHER_URL = 'http://localhost:3064/utiac-decisions'.freeze
  # Directory the per-decision JSON files are written to.
  OUTPUT_DIR = 'json'.freeze

  # Scrapes and publishes every listing page in the given inclusive range.
  # Called without arguments it processes page 1 only.
  #
  # @param first_page [Integer] first listing page to scrape
  # @param last_page [Integer] last listing page to scrape
  # @return [Integer] +first_page+
  def scrape_and_publish(first_page = 1, last_page = first_page)
    first_page.upto(last_page) { |page| scrape_page(page) }
  end

  # Fetches one listing page, collects the decision links on it and hands
  # them to {#download}.
  #
  # @param page [Integer] listing page number
  # @return [Array<String>] the decision hrefs that were processed
  def scrape_page(page)
    puts page
    listing = Nokogiri::HTML(fetch_html("#{SOURCE_URL}/utiac?page=#{page}"))
    hrefs = listing.search('a').map { |anchor| anchor['href'] }
    download decision_links(hrefs)
  end

  # Fetches each decision page, publishes its metadata and saves the same
  # metadata as <last path segment>.json inside OUTPUT_DIR.
  #
  # @param items [Array<String>] decision hrefs, relative or absolute
  # @return [Array<String>] +items+
  def download(items)
    # File.write does not create missing parent directories.
    Dir.mkdir(OUTPUT_DIR) unless Dir.exist?(OUTPUT_DIR)

    items.each do |link|
      puts link
      # URI.join copes with both site-relative and absolute hrefs.
      doc = Nokogiri::HTML(fetch_html(URI.join(SOURCE_URL, link).to_s))
      details = metadata(doc)
      publish(details)
      filename = "#{link.split('/').last}.json"
      puts filename
      File.write(File.join(OUTPUT_DIR, filename), details.to_json)
    end
  end

  # Builds the metadata hash for one decision page: the <h1> text under
  # 'Title', one entry per labelled item of the first ul.decision-details,
  # and one entry per anchor of the first .download-links element (keyed by
  # the anchor's class attribute, valued by its href).
  #
  # @param doc [Nokogiri::HTML::Document] parsed decision page
  # @return [Hash] metadata keyed by label / link class
  def metadata(doc)
    details = { 'Title' => doc.at('h1').text }

    # A page without these sections simply contributes no entries instead of
    # raising NoMethodError on nil.
    detail_list = doc.at('ul.decision-details')
    fields = detail_list ? detail_list.search('li') : []
    fields.each do |field|
      label = field.at('.label')
      value = field.at('span[2]')
      next unless label && value

      details[label.text.chomp(':')] = value.text.strip
    end

    link_list = doc.at('.download-links')
    attachments = link_list ? link_list.search('a') : []
    attachments.each { |anchor| details[anchor['class']] = anchor['href'] }

    details
  end

  # Submits one decision through the publisher's "New document" form.
  #
  # @param data [Hash] metadata as produced by {#metadata}
  # @return [Object] whatever Mechanize returns from the form submission
  def publish(data)
    agent = Mechanize.new
    index = agent.get(PUBLISHER_URL)
    form = index.link_with(text: 'New document').click.form

    form['utiac_decision[title]'] = data['Title']
    form['utiac_decision[summary]'] = data['Case title']
    form['utiac_decision[body]'] = 'N/A'
    form['utiac_decision[promulgation_date]'] = date(data['Promulgation date'])
    form['utiac_decision[country_guidance]'] = 'Not Country Guidance'
    form['utiac_decision[country]'] = country(data['Country'])
    form['utiac_decision[decision_reported]'] = reported(data['Status of case'])

    # instance_of? (not is_a?) keeps the original exact-class match, so
    # subclasses of MultiSelectList are not picked up.
    judge_field = form.fields.find do |field|
      field.name.to_s[/judges/] && field.instance_of?(Mechanize::Form::MultiSelectList)
    end
    judge_field.value = judges(data['Judges'])

    form.submit
  end

  # Normalises a free-form date string to ISO 8601 (YYYY-MM-DD).
  #
  # @param value [String] date text accepted by Date.parse
  # @return [String]
  # @raise [ArgumentError, TypeError] when the value cannot be parsed
  def date(value)
    Date.parse(value).to_s
  end

  # Maps a blank value or 'Not applicable' to 'No Country'; any other value
  # is returned unchanged.
  #
  # @param value [String, nil]
  # @return [String]
  def country(value)
    return 'No Country' if value.to_s.strip.empty? || value == 'Not applicable'

    value
  end

  # Maps the status text to 'Reported' only when it is exactly "reported"
  # (case-insensitive, surrounding whitespace ignored).
  #
  # @param value [String, nil]
  # @return [String] 'Reported' or 'Not Reported'
  def reported(value)
    # \A/\z anchor the whole string; ^/$ would accept "reported" on any line.
    value.to_s.strip =~ /\Areported\z/i ? 'Reported' : 'Not Reported'
  end

  # Splits a comma-separated list of judge names into trimmed names.
  #
  # @param value [String, nil]
  # @return [Array<String>] names; empty when value is nil or blank
  def judges(value)
    value.to_s.split(',').map(&:strip).reject(&:empty?)
  end

  private

  # Reads the body at +url+ through open-uri's URI#read, which closes the
  # underlying IO and does not depend on the Kernel#open patch.
  def fetch_html(url)
    URI.parse(url).read
  end

  # Keeps the unique hrefs that contain 'utiac/', ignoring anchors that have
  # no href at all.
  def decision_links(hrefs)
    hrefs.compact.select { |href| href.include?('utiac/') }.uniq
  end
end
# Script entry point: starts the scrape-and-publish run as soon as this file is executed.
UtiacScraper.scrape_and_publish
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.