tokumine · May 23, 2012 21:07
diff --git a/mp_scrape.rb b/mp_scrape.rb
 require 'rubygems'
 require 'json'
 require 'net/http'
 require 'open-uri'
 require 'nokogiri'
 require 'date'
 require 'csv'

 # Scrapes House of Commons representatives from the BBC Democracy Live search
 # service, visits each MP's profile page, and writes one row per MP to mps.csv.

 BASE_DOMAIN = "news.bbc.co.uk"

 # Listing pages to visit: the same 1..65 range the original counter loop covered.
 PAGES = (1..65)

 # Whitespace at the start or end of any line. `^` and `$` anchor per line, so a
 # multi-line string has every line trimmed; the DOB extraction splits the
 # trimmed text on newlines afterwards.
 EDGE_SPACE = /^\p{Space}+|\p{Space}+$/

 # Path (with query string) of the search-results listing for the given page.
 def listing_path(page)
   "/democracylive/service/search?q=&type=representatives&institution=House+of+Commons&start=#{page}&locale=en_GB"
 end

 # Removes leading/trailing whitespace from every line of +text+.
 def trim(text)
   text.gsub(EDGE_SPACE, "")
 end

 # Runs the block and returns its value, or nil if it raises a StandardError.
 # The scrape is deliberately best-effort: a missing node on a profile page
 # leaves that field blank instead of aborting the whole run.
 def attempt
   yield
 rescue StandardError
   nil
 end

 # Whole years between +dob_date+ and +today+ (both Date), using 365.25-day years.
 def age_in_years(dob_date, today = Date.today)
   ((today - dob_date).to_i / 365.25).to_i
 end

 # Returns [street, locality, postcode] from the first address entry when it is
 # headed "Constituency address"; otherwise (or on any lookup failure) three nils.
 def constituency_address(doc)
   found = attempt do
     addr = doc.css('li.address')
     if addr.css('h3').first.content == "Constituency address"
       [
         addr.css('span.street-address').first.content,
         addr.css('span.locality').first.content,
         addr.css('span.postal-code').first.content
       ]
     end
   end
   found || [nil, nil, nil]
 end

 # Builds the record for one MP from a parsed profile page.
 #
 # Every record carries the same keys in the same order, with nil for anything
 # that could not be extracted. This matters because the CSV header row is
 # taken from the first record's keys.
 def extract_mp(doc)
   abbr = doc.css('h1 span.title abbr').first

   # The span text is "<constituency>,<party>"; split once and reuse both halves.
   seat, party = attempt { trim(doc.css('h1 span.title span').first.content).split(',') }

   # The second line of the trimmed list item is parsed as the date of birth.
   dob_date = attempt do
     Date.parse(trim(doc.css('div.content-object-2 ul li.li-b').first.content).split("\n")[1])
   end

   street, locality, postcode = constituency_address(doc)

   {
     name:         attempt { trim(doc.css('h1 span.fn').first.content) },
     title:        attempt { trim(abbr.content) },
     title_long:   attempt { trim(abbr.attr('title')) },
     constituency: seat,
     party:        attempt { trim(party) },
     addr:         street,
     locality:     locality,
     postcode:     postcode,
     telephone:    attempt { doc.css('ul li h3 span.tel').first.content },
     email:        attempt { doc.css('ul li h3 a.email').first.content },
     dob:          (dob_date ? dob_date.to_s : nil),
     age:          attempt { age_in_years(dob_date) }
   }
 end

 mps = []

 PAGES.each do |page_counter|
   base_path = listing_path(page_counter)
   puts "http://#{BASE_DOMAIN}#{base_path}"
   response = Net::HTTP.get_response(BASE_DOMAIN, base_path)

   # The JSON payload carries an HTML fragment listing the MPs on this page.
   html = Nokogiri::HTML(JSON.parse(response.body)["dl_representatives"])

   html.css('h2 a').each do |link|
     # URI#open (from open-uri) instead of Kernel#open: the latter no longer
     # accepts URLs on Ruby 3.0+ and would execute a command if the scraped
     # href began with "|".
     doc = Nokogiri::HTML(URI.parse(link.attr('href')).open)

     record = extract_mp(doc)
     mps << record
     puts "#{page_counter}: #{record[:name]}"
   end
 end

 # write hash to CSV
 if mps.empty?
   # Nothing scraped: skip the file rather than crash on mps.first being nil.
   warn "No MPs scraped; mps.csv not written"
 else
   headers = mps.first.keys
   CSV.open("mps.csv", "wb", headers: headers) do |csv|
     csv << headers
     mps.each { |mp| csv << mp.values_at(*headers) }
   end
 end
	require 'rubygems'
	require 'json'
	require 'net/http'
	require 'open-uri'
	require 'nokogiri'
	require 'date'
	require 'csv'

	# Scrapes House of Commons representatives from the BBC Democracy Live search
	# service, visits each MP's profile page, and writes one row per MP to mps.csv.
	# Each field is extracted best-effort: any failure leaves that field nil.
	page_counter = 1
	base_domain = "news.bbc.co.uk"
	mps = []

	while page_counter < 66
	  base_path = "/democracylive/service/search?q=&type=representatives&institution=House+of+Commons&start=#{page_counter}&locale=en_GB"
	  puts "http://#{base_domain}#{base_path}"
	  response = Net::HTTP.get_response(base_domain, base_path)

	  # The JSON payload carries an HTML fragment listing the MPs on this page.
	  html = Nokogiri::HTML(JSON.parse(response.body)["dl_representatives"])

	  html.css('h2 a').each do |link|
	    # URI#open (from open-uri) instead of Kernel#open: the latter no longer
	    # accepts URLs on Ruby 3.0+ and would execute a command if the scraped
	    # href began with "|".
	    doc = Nokogiri::HTML(URI.parse(link.attr('href')).open)

	    # Named `record` rather than `p` so Kernel#p is not shadowed.
	    record = {}

	    # NAME
	    begin
	      record[:name] = doc.css('h1 span.fn').first.content.gsub(/^\p{Space}+|\p{Space}+$/, "")
	    rescue
	      record[:name] = nil
	    end

	    # TITLE ABBR
	    begin
	      record[:title] = doc.css('h1 span.title abbr').first.content.gsub(/^\p{Space}+|\p{Space}+$/, "")
	    rescue
	      record[:title] = nil
	    end

	    # TITLE LONGHAND
	    begin
	      record[:title_long] = doc.css('h1 span.title abbr').first.attr('title').gsub(/^\p{Space}+|\p{Space}+$/, "")
	    rescue
	      record[:title_long] = nil
	    end

	    # CONSTITUENCY (text before the first comma)
	    begin
	      record[:constituency] = doc.css('h1 span.title span').first.content.gsub(/^\p{Space}+|\p{Space}+$/, "").split(',')[0]
	    rescue
	      record[:constituency] = nil
	    end

	    # PARTY (text after the first comma)
	    begin
	      record[:party] = doc.css('h1 span.title span').first.content.gsub(/^\p{Space}+|\p{Space}+$/, "").split(',')[1].gsub(/^\p{Space}+|\p{Space}+$/, "")
	    rescue
	      record[:party] = nil
	    end

	    # CONSTITUENCY ADDRESS
	    # Seed the keys first so every record has the same columns in the same
	    # order even when the heading does not match; the CSV header row is
	    # taken from the first record's keys.
	    record[:addr] = nil
	    record[:locality] = nil
	    record[:postcode] = nil
	    begin
	      addr = doc.css('li.address')
	      if addr.css('h3').first.content == "Constituency address"
	        record[:addr] = addr.css('span.street-address').first.content
	        record[:locality] = addr.css('span.locality').first.content
	        record[:postcode] = addr.css('span.postal-code').first.content
	      end
	    rescue
	      record[:addr] = nil
	      record[:locality] = nil
	      record[:postcode] = nil
	    end

	    # TELEPHONE
	    begin
	      record[:telephone] = doc.css('ul li h3 span.tel').first.content
	    rescue
	      record[:telephone] = nil
	    end

	    # EMAIL
	    begin
	      record[:email] = doc.css('ul li h3 a.email').first.content
	    rescue
	      record[:email] = nil
	    end

	    # DOB (second line of the trimmed list item is parsed as a date)
	    begin
	      dob = doc.css('div.content-object-2 ul li.li-b').first.content.gsub(/^\p{Space}+|\p{Space}+$/, "")
	      dobsplit = dob.split("\n")
	      dob_date = Date.parse dobsplit[1]
	      record[:dob] = dob_date.to_s
	    rescue
	      record[:dob] = nil
	    end

	    # AGE (whole years, 365.25-day years; nil when the DOB could not be parsed)
	    begin
	      age = (Date.today - dob_date).to_i
	      record[:age] = (age / 365.25).to_i
	    rescue
	      record[:age] = nil
	    end

	    mps << record
	    puts "#{page_counter}: #{record[:name]}"
	  end
	  page_counter += 1
	end

	# write hash to CSV
	if mps.empty?
	  # Nothing scraped: skip the file rather than crash on mps.first being nil.
	  warn "No MPs scraped; mps.csv not written"
	else
	  headers = mps.first.keys
	  CSV.open("mps.csv", "wb", headers: headers) do |csv|
	    csv << headers
	    mps.each do |mp|
	      csv << headers.map { |h| mp[h] }
	    end
	  end
	end
No results found