Created
June 27, 2016 14:57
-
-
Save JoshAshby/c8905df74a98bdedc32b6d3992aa56b4 to your computer and use it in GitHub Desktop.
Mostly parses the HOPE 11 schedule into a TSV; it messes up on one or two rows with the "extra info".
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'net/http'
require 'net/https'
require 'nokogiri'

# Downloads https://hope.net/schedule.html, pairs every <h3> heading under the
# page's article with the last <p> that follows it, and writes one
# tab-separated row per heading to hope_schedule.tsv:
#
#   title <TAB> day <TAB> time <TAB> room <TAB> extra
#
# NOTE(review): the day/time/room/extra meaning of the four whitespace-split
# pieces is taken from the struct member names only; confirm against the page.

uri = URI('https://hope.net/schedule.html')
http = Net::HTTP.new(uri.host, uri.port)
http.use_ssl = true
http.verify_mode = OpenSSL::SSL::VERIFY_PEER

response = http.request(Net::HTTP::Get.new(uri))
# Raise on any non-2xx reply rather than parsing an error or redirect page
# as if it were the schedule.
response.value

body_giri = Nokogiri::HTML(response.body)
# Headings and paragraphs that are direct children of the article, in
# document order.
nodes = body_giri.xpath('//*[@id="pagecontent"]/article/*[self::h3|self::p]')

ScheduleBlock = Struct.new(:day, :time, :room, :extra)

current_section = nil
sections = nodes.each_with_object({}) do |node, memo|
  if node.name == 'h3'
    current_section = node.text
    # Reserve the row now so a heading without any paragraph still yields a
    # (blank) row, and a repeated heading keeps what it already collected.
    memo[current_section] ||= ScheduleBlock.new
    next
  end

  # A paragraph before the first heading has no section to belong to.
  next if current_section.nil?

  first_child = node.children.first
  # An empty <p> has no child node to read text from.
  next if first_child.nil?

  # Only the last paragraph of a section is kept, so each paragraph simply
  # overwrites the previous one. The text is split on whitespace into at most
  # four pieces; missing pieces leave the remaining struct members nil.
  memo[current_section] = ScheduleBlock.new(*first_child.text.split(' ', 4))
end

# The block form closes the file even if writing a row raises.
File.open('hope_schedule.tsv', 'w') do |file|
  sections.each do |title, block|
    file.puts "#{title}\t#{block.day}\t#{block.time}\t#{block.room}\t#{block.extra}"
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment