Skip to content

Instantly share code, notes, and snippets.

@techbelly
Created August 7, 2012 11:20
Show Gist options
  • Save techbelly/3284608 to your computer and use it in GitHub Desktop.
Old Bailey Scraper
require 'hpricot'
require 'open-uri'
# Scrapes trial reports from Old Bailey Online and loads them into
# OldBaileyReport records.
#
# Instantiating the class runs the whole pipeline: every report linked from
# the search index is downloaded into "#{RAILS_ROOT}/data/reports/", then each
# saved file is parsed and a record is created for every report that carries
# both a punishment category and a punishment subcategory.
class OldBaileyParser
  # Search-results page whose report links are followed by #scrape_reports.
  # (The original code also requested the equivalent 1707 index first, but
  # discarded the response before using it, so only this URL was ever scraped.)
  INDEX_URL = "http://www.oldbaileyonline.org/search.jsp?foo=bar&form=searchHomePage&_divs_div0Type_div1Type=sessionsPaper%7CtrialAccount&fromYear=1708&fromMonth=01&toYear=1708&toMonth=12&start=170&count=0".freeze

  # Matches a report link and captures its id. The id ends at the next "&" or
  # at the end of the href, so links where "id" is the last parameter work too.
  REPORT_LINK_PATTERN = /browse\.jsp\?id=([^&]+)/

  def initialize
    scrape_reports
    parse_reports
  end

  # Parses every file in the reports directory and hands the embedded report
  # document to #parse. Files without a div#main2 element are skipped with a
  # warning instead of aborting the whole run.
  def parse_reports
    Dir.glob("#{reports_dir}/*").each do |path|
      puts path
      # File.read closes the handle; the bare open(...).read it replaces did not.
      doc = Hpricot.XML(File.read(path))
      main_div = doc.at('div#main2')
      unless main_div
        warn "Skipping #{path}: no div#main2 element found"
        next
      end
      # The text content of div#main2 is itself parsed as XML.
      parse(Hpricot.XML(main_div.inner_text))
    end
  end

  # Downloads every report linked from the index page into the reports
  # directory, one file per report id.
  #
  # @param index_url [String] search-results page to read report links from
  def scrape_reports(index_url = INDEX_URL)
    index_doc = Hpricot.XML(fetch(index_url))
    index_doc.search('a').each do |link|
      match = REPORT_LINK_PATTERN.match(link.attributes['href'].to_s)
      next unless match
      save_report(match[1])
    end
  end

  # Creates an OldBaileyReport from a parsed report document.
  #
  # @param doc document as returned by Hpricot.XML (anything responding to
  #   #search('interp') works)
  # @return the result of OldBaileyReport.create, or nil when the report lacks
  #   a punishment category or subcategory
  def parse(doc)
    punishment_value = punishment(doc)
    punishment_type_value = punishment_type(doc)
    return nil unless punishment_value && punishment_type_value

    OldBaileyReport.create(:offence => offence(doc),
                           :offence_type => offence_type(doc),
                           :punishment => punishment_value,
                           :punishment_type => punishment_type_value,
                           :date => date(doc))
  end

  # @return [String, nil] value of the first punishmentSubcategory interp tag
  def punishment(doc)
    interp_value(doc, 'punishmentSubcategory')
  end

  # @return [String, nil] value of the first punishmentCategory interp tag
  def punishment_type(doc)
    interp_value(doc, 'punishmentCategory')
  end

  # @return [String, nil] value of the first date interp tag
  def date(doc)
    interp_value(doc, 'date')
  end

  # @return [String, nil] value of the first offenceSubcategory interp tag
  def offence(doc)
    interp_value(doc, 'offenceSubcategory')
  end

  # @return [String, nil] value of the first offenceCategory interp tag
  def offence_type(doc)
    interp_value(doc, 'offenceCategory')
  end

  private

  # Returns the "value" attribute of the first <interp> element whose "type"
  # attribute equals +type+, or nil when there is no such element.
  def interp_value(doc, type)
    tag = doc.search('interp').detect { |element| element.attributes['type'] == type }
    tag && tag.attributes['value']
  end

  # Directory the scraped reports are written to and later read from.
  def reports_dir
    "#{RAILS_ROOT}/data/reports"
  end

  # Builds the raw-XML URL for a report id. The sessions-paper file name is the
  # part of the id before the first "-" with every "t" removed.
  #
  # URL shapes noted by the original author:
  # http://www.oldbaileyonline.org/browse.jsp?foo=bar&path=sessionsPapers/17070115.xml&div=t17070115-3&xml=yes
  # http://www.oldbaileyonline.org/browse.jsp?foo=bar&path=sessionsPapers/t17070115.xml&div=t17070115-1&xml=yes
  def report_url_for(report_id)
    report_id_base = report_id.split('-').first.to_s.delete('t')
    "http://www.oldbaileyonline.org/browse.jsp?foo=bar&path=sessionsPapers/#{report_id_base}.xml&div=#{report_id}&xml=yes"
  end

  # Downloads one report and writes it to the reports directory.
  def save_report(report_id)
    report_url = report_url_for(report_id)
    puts report_url
    report_text = fetch(report_url)
    puts report_text
    # The id comes from a remote page, so keep only its last path component to
    # stay inside the reports directory. The block form closes the file even
    # if the write raises.
    File.open("#{reports_dir}/#{File.basename(report_id)}", 'w') do |f|
      f.write(report_text)
    end
  end

  # Reads the body at +url+ and closes the connection afterwards. Uses URI.open
  # where open-uri provides it, because Kernel#open no longer opens URLs on
  # Ruby 3.0 and later; falls back to open-uri's Kernel#open on older Rubies.
  def fetch(url)
    if URI.respond_to?(:open)
      URI.open(url) { |io| io.read }
    else
      open(url) { |io| io.read }
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment