tommorris · August 21, 2009 14:26
diff --git a/gistfile1.rb b/gistfile1.rb
 require "rubygems"
 require "nokogiri"
 require "open-uri"

 # last week, I started writing this code to parse data from search.ucas.com
 # to help some of the people going to Young Rewired State. It looks like I'm
 # not going to be able to go to YRS as I'm busy with academic work. I'd
 # really appreciate it if someone could help them out.

 # you don't have to use Ruby - it's just my hack language of choice. this
 # is just screen scraping code, so feel free to rewrite it in Perl or Python
 # or whatever you prefer. All you need to know with this code is that it uses
 # a mixture of XPath and CSS to search the DOM. 'collect' (also known as 'map')
 # runs the block provided on every member of the array and returns a new array
 # made up of the results.
 # so:
  # [1,2,3].collect {|i| i * 2 }
 # would return:
  # [2,4,6]

 # NOTE: you need to write some code that gets a new browsing state code -
 # basically, go to the website and you'll get redirected to a search page.

 # First we fetch the search results page and load it as a Nokogiri HTML
 # document. URI.open (from open-uri) is used instead of a bare open(): on
 # Ruby 3.0+ Kernel#open no longer fetches URLs. The block form closes the
 # connection as soon as the body has been read.
 # NOTE(review): the StateId segment of this URL is presumably the "browsing
 # state code" mentioned in the NOTE above - confirm, and replace it with a
 # fresh one before running.
 url = "http://search.ucas.com/cgi-bin/hsrun/search/search/StateId/DwPCio6PjmENd6USbOvaEWMMV5H6s-UcGa/HAHTpage/search.HsKeywordSuggestion.whereNext?query=532&word=PHILOSOPHY&single=Y"
 data = URI.open(url) { |page| Nokogiri::HTML.parse(page.read) }

 # Keep only the 400px-wide table cells that actually contain a course link.
 course_cells = data.search("//td[@width='400']").to_a.select { |cell|
   !cell.search("a.bodyLink").first.nil?
 }

 # Build one hash per course and keep the result in 'courses' so it can be
 # used afterwards ('map' is the same method as the 'collect' described above):
 #   :href  - absolute URL of the course page
 #   :title - text of the course link
 #   :code  - text of the small grey span with the surrounding parentheses
 #            stripped, or nil when the cell has no such span
 courses = course_cells.map { |cell|
   link = cell.search("a.bodyLink").first
   code = cell.search("span.bodyTextSmallGrey").first
   {
     :href   => "http://search.ucas.com#{link['href']}",
     :title  => link.content,
     :code   => code && code.content.gsub(/\(([A-Za-z0-9]+)\)/, "\\1")
   }
 }

 # once you've got the HREF for each course, you load it up and extract any
 # data from it that would be useful to help find university courses.
 # eventually, I thought it'd be pretty neat to merge this data with the data
 # available on dbpedia/freebase etc.

 # sorry I can't attend. I really wish I could. I hope it goes well, and I
 # will be watching what goes on through twitter/blogs etc.

 # if nobody gets around to doing this, I will get around to writing a UCAS
 # parser - BUT not until next month.
 #
 # -tom
	require "rubygems"
	require "nokogiri"
	require "open-uri"

	# last week, I started writing this code to parse data from search.ucas.com
	# to help some of the people going to Young Rewired State. It looks like I'm
	# not going to be able to go to YRS as I'm busy with academic work. I'd
	# really appreciate it if someone could help them out.

	# you don't have to use Ruby - it's just my hack language of choice. this
	# is just screen scraping code, so feel free to rewrite it in Perl or Python
	# or whatever you prefer. All you need to know with this code is that it uses
	# a mixture of XPath and CSS to search the DOM. 'collect' (also known as 'map')
	# runs the block provided on every member of the array and returns a new array
	# made up of the results.
	# so:
	# [1,2,3].collect {|i| i * 2 }
	# would return:
	# [2,4,6]

	# NOTE: you need to write some code that gets a new browsing state code -
	# basically, go to the website and you'll get redirected to a search page.

	# First we fetch the search results page and load it as a Nokogiri HTML
	# document. URI.open (from open-uri) is used instead of a bare open(): on
	# Ruby 3.0+ Kernel#open no longer fetches URLs. The block form closes the
	# connection as soon as the body has been read.
	# NOTE(review): the StateId segment of this URL is presumably the "browsing
	# state code" mentioned in the NOTE above - confirm, and replace it with a
	# fresh one before running.
	url = "http://search.ucas.com/cgi-bin/hsrun/search/search/StateId/DwPCio6PjmENd6USbOvaEWMMV5H6s-UcGa/HAHTpage/search.HsKeywordSuggestion.whereNext?query=532&word=PHILOSOPHY&single=Y"
	data = URI.open(url) { |page| Nokogiri::HTML.parse(page.read) }

	# Keep only the 400px-wide table cells that actually contain a course link.
	# (Block parameters are written |cell|; the escaped form \|i\| is not valid Ruby.)
	course_cells = data.search("//td[@width='400']").to_a.select { |cell|
	  !cell.search("a.bodyLink").first.nil?
	}

	# Build one hash per course and keep the result in 'courses' so it can be
	# used afterwards ('map' is the same method as the 'collect' described above):
	#   :href  - absolute URL of the course page
	#   :title - text of the course link
	#   :code  - text of the small grey span with the surrounding parentheses
	#            stripped, or nil when the cell has no such span
	courses = course_cells.map { |cell|
	  link = cell.search("a.bodyLink").first
	  code = cell.search("span.bodyTextSmallGrey").first
	  {
	    :href => "http://search.ucas.com#{link['href']}",
	    :title => link.content,
	    :code => code && code.content.gsub(/\(([A-Za-z0-9]+)\)/, "\\1")
	  }
	}

	# once you've got the HREF for each course, you load it up and extract any
	# data from it that would be useful to help find university courses.
	# eventually, I thought it'd be pretty neat to merge this data with the data
	# available on dbpedia/freebase etc.

	# sorry I can't attend. I really wish I could. I hope it goes well, and I
	# will be watching what goes on through twitter/blogs etc.

	# if nobody gets around to doing this, I will get around to writing a UCAS
	# parser - BUT not until next month.
	#
	# -tom