Created
August 7, 2012 11:20
-
-
Save techbelly/3284608 to your computer and use it in GitHub Desktop.
Old Bailey Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'hpricot' | |
require 'open-uri' | |
class OldBaileyParser | |
def initialize | |
scrape_reports | |
parse_reports | |
end | |
def parse_reports | |
Dir.glob("#{RAILS_ROOT}/data/reports/*").each do |file| | |
puts file | |
file = open(file) | |
doc = Hpricot.XML(file.read) | |
main_div = doc.at('div#main2') | |
report_doc = Hpricot.XML(main_div.inner_text) | |
parse(report_doc) | |
end | |
# http://www.oldbaileyonline.org/browse.jsp?foo=bar&path=sessionsPapers/17070115.xml&div=t17070115-3&xml=yes | |
# http://www.oldbaileyonline.org/browse.jsp?foo=bar&path=sessionsPapers/t17070115.xml&div=t17070115-1&xml=yes | |
end | |
def scrape_reports | |
file = open("http://www.oldbaileyonline.org/search.jsp?foo=bar&form=searchHomePage&count=0&_divs_div0Type_div1Type=sessionsPaper%7CtrialAccount&fromYear=1707&fromMonth=01&toYear=1707&toMonth=12&start=170") | |
file = open("http://www.oldbaileyonline.org/search.jsp?foo=bar&form=searchHomePage&_divs_div0Type_div1Type=sessionsPaper%7CtrialAccount&fromYear=1708&fromMonth=01&toYear=1708&toMonth=12&start=170&count=0") | |
index_doc = Hpricot.XML(file.read) | |
report_links = index_doc.search('a').select{ |ele| ele.attributes['href'] =~ /browse.jsp\?id=/} | |
id_pattern = /id=(.*?)&/ | |
report_links.each do |report_link| | |
report_id = id_pattern.match(report_link.attributes['href'])[1] | |
report_id_base = report_id.split('-')[0].gsub('t', '') | |
report_url = "http://www.oldbaileyonline.org/browse.jsp?foo=bar&path=sessionsPapers/#{report_id_base}.xml&div=#{report_id}&xml=yes" | |
puts report_url | |
report_text = open(report_url).read | |
puts report_text | |
f = open("#{RAILS_ROOT}/data/reports/#{report_id}", "w") | |
f.write(report_text) | |
f.close | |
end | |
end | |
def parse(doc) | |
offence_type = offence_type(doc) | |
offence = offence(doc) | |
punishment = punishment(doc) | |
punishment_type = punishment_type(doc) | |
date = date(doc) | |
if punishment and punishment_type | |
old_bailey_report = OldBaileyReport.create(:offence => offence, | |
:offence_type => offence_type, | |
:punishment => punishment, | |
:punishment_type => punishment_type, | |
:date => date) | |
end | |
end | |
def punishment(doc) | |
punishment_tag = doc.search('interp').detect{ |element| element.attributes['type'] == 'punishmentSubcategory'} | |
return nil unless punishment_tag | |
punishment_tag.attributes['value'] | |
end | |
def punishment_type(doc) | |
punishment_type_tag = doc.search('interp').detect{ |element| element.attributes['type'] == 'punishmentCategory'} | |
return nil unless punishment_type_tag | |
punishment_type_tag.attributes['value'] | |
end | |
def date(doc) | |
date_tag = doc.search('interp').detect{ |element| element.attributes['type'] == 'date'} | |
return nil unless date_tag | |
date_tag.attributes['value'] | |
end | |
def offence(doc) | |
offence_tag = doc.search('interp').detect{ |element| element.attributes['type'] == 'offenceSubcategory'} | |
return nil unless offence_tag | |
offence_tag.attributes['value'] | |
end | |
def offence_type(doc) | |
offence_type_tag = doc.search('interp').detect{ |element| element.attributes['type'] == 'offenceCategory'} | |
return nil unless offence_type_tag | |
offence_type_tag.attributes['value'] | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment