Created
December 3, 2010 13:27
-
-
Save danbri/726948 to your computer and use it in GitHub Desktop.
read data from East Side Traders site
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
#
# eastsiderocks.rb - read data from the Visit East Side traders site.
#
# This Ruby script extracts data from the trader detail pages of the
# Visit East Side site and prints one tab-separated line per trader.
#
# Notes:
# Please play nice and only use the data for purposes that further the goals of
# Bristol East Side Traders. http://www.bristoleastsidetraders.co.uk/
# "BEST (Bristol East Side Traders) is a not-for-profit company that aims to
# develop a thriving local economy in Bristol's inner city that benefits
# residents - as entrepreneurs, employees and customers."
#
# I haven't yet asked permission or forgiveness to do anything with this data.
# If you make something, please bear that in mind, and raise it with them
# before announcing.
#
# The script pulls the postcode out of the address field. For background see
# http://data.gov.uk/ideas/link-post-code-government-services
# http://www.epsiplus.net/news/news/ordnance_survey_postcodes_as_linked_data etc.
#
# Dan Brickley <[email protected]>
#
# Usage:
# TellyClub:hackbristol danbri$ ./eastsiderocks.rb
# 1 BS5 9LT Clarks Pies 114b Church Road, Redfield, Bristol BS5 9LT
# 2 BS5 9LH Claudette Reid Financial Consultancy 31 Verrier Road, Redfield, Bristol BS5 9LH
# 3 BS5 9LJ Clear Financial Solutions 112a Church Road, Redfield, Bristol BS5 9LJ
# 5 BS5 6NQ Coach House Pub 380 Stapleton Road, Easton, Bristol BS5 6NQ 12pm - late
# 6 BS6 5NU Co-Creative 119a Ashley Road, Montpelier, Bristol BS6 5NU
# ...etc.
# Columns are: trader id, postcode, name, website, address, opening times.
# Trader ids whose page says "Organisation Not Found" produce no line.
#
# Tech notes:
# The site refuses to serve search results without a session cookie
# (":( Sorry, you need cookies enabled to use the search."), so you need to
# get a PHPSESSID from e.g. the Chrome browser (settings / under the hood /
# content settings / cookies):
#   cookie  content: 166a38e8088a5ea0e39796b10aabcae1
#           domain:  www.visiteastside.co.uk
#           path:    /
# A session ID of mine is included below as the default; not sure how long
# it'll work for. Set the EASTSIDE_PHPSESSID environment variable to use a
# fresh one without editing the script.
# More on cookie handling with open-uri:
# http://stackoverflow.com/questions/1360808/rubys-open-uri-and-cookies

require 'open-uri'

# Session ID used when EASTSIDE_PHPSESSID is not set.
DEFAULT_PHPSESSID = '166a38e8088a5ea0e39796b10aabcae1'

# Detail-page URL; %d is replaced by the trader id.
DETAILS_URL = 'http://www.visiteastside.co.uk/search_eastside/details.php?trader_id=%d&offset='

# Highest trader id that is tried (ids run from 0 to this value inclusive).
LAST_TRADER_ID = 700

# Marker the site returns for a trader id that does not exist.
NOT_FOUND_RE = /<h1>Organisation Not Found<\/h1>/

# e.g. <h1>Search Eastside</h1><h2>Clarks Pies</h2>
NAME_RE = /<h1>Search Eastside<\/h1><h2>([^<]+)<\/h2>/

# Bristol postcode inside an address, e.g. "BS5 9LT".
POSTCODE_RE = /(BS\d+\s+\d+\S+)/

# [result key, label text preceding ":</span>" in the page], in output order.
# e.g. <p><span class="label">Telephone:</span> 0117 955 6702</p>
# 'Times' deliberately matches the tail of the "Opening Times:" label.
LABELLED_FIELDS = [
  ['addr',  'Address'],
  ['tel',   'Telephone'],
  ['mob',   'Mobile'],
  ['fax',   'Fax'],
  ['web',   'Website'],
  ['times', 'Times']
]

# Strips surrounding whitespace from a captured value.
# Returns nil when nothing but whitespace was captured, which is what the
# page yields for an empty field such as "<span ...>Mobile:</span> </p>".
def clean_field(text)
  value = text.to_s.strip
  value.empty? ? nil : value
end

# Returns the cleaned text following "<label>:</span>" up to the closing
# </p>, or nil when the label is missing or its value is blank.
def labelled_value(page, label)
  return nil unless page =~ /#{Regexp.escape(label)}:<\/span>\s*([^<]+)<\/p>/
  clean_field($1)
end

# Parses the HTML of one trader detail page.
#
# page - String containing the page HTML.
#
# Returns nil for an "Organisation Not Found" page; otherwise a Hash with
# whichever of the keys 'name', 'addr', 'postcode', 'tel', 'mob', 'fax',
# 'web' and 'times' could be extracted. 'postcode' is set whenever 'addr'
# is, and is nil if the address contains no BS postcode.
def parse_trader_page(page)
  return nil if page =~ NOT_FOUND_RE

  poi = {}
  if page =~ NAME_RE
    name = clean_field($1)
    poi['name'] = name if name
  end

  LABELLED_FIELDS.each do |key, label|
    value = labelled_value(page, label)
    next unless value
    poi[key] = value
    poi['postcode'] = value[POSTCODE_RE, 1] if key == 'addr'
  end
  poi
end

# Fetches url, sending the given session cookie, and returns the body.
# Uses URI.open where it exists; calling Kernel#open on a URL was deprecated
# in Ruby 2.7 and removed in 3.0. Older Rubies fall back to Kernel#open.
def fetch_page(url, cookie)
  headers = { 'Cookie' => cookie }
  if URI.respond_to?(:open)
    URI.open(url, headers) { |f| f.read }
  else
    open(url, headers) { |f| f.read }
  end
end

# Walks every trader id, prints one tab-separated line per trader found and
# returns a Hash of trader id => parsed fields.
def main
  cookie = "PHPSESSID=#{ENV['EASTSIDE_PHPSESSID'] || DEFAULT_PHPSESSID}"
  poidb = {}

  0.step(LAST_TRADER_ID, 1) do |i|
    sleep 1 # play nice
    url = DETAILS_URL % i
    # puts "Fetching #{url}"

    begin
      page = fetch_page(url, cookie)
    rescue OpenURI::HTTPError => e
      # One bad page should not abort the rest of the crawl.
      warn "Skipping #{url}: #{e.message}"
      next
    end

    poi = parse_trader_page(page)
    next if poi.nil?

    poidb[i] = poi
    puts "#{i}\t#{poi['postcode']}\t#{poi['name']}\t#{poi['web']}\t#{poi['addr']}\t#{poi['times']}"
  end

  # To dump everything that was collected, uncomment:
  # poidb.each do |id, poi|
  #   puts "#{id}\t#{poi.inspect}"
  # end
  poidb
end

main if __FILE__ == $0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment