Created
April 22, 2013 09:24
-
-
Save alanho/5433521 to your computer and use it in GitHub Desktop.
Extract voting result JPEGs/PDFs from LegCo website
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "nokogiri"
require "open-uri"
require "set"
require "uri"
# Seed pages for the crawl. Judging by the paths these are the LegCo
# council-meeting index pages for the 2008-09 through 2012-13 sessions.
urls = [
  "http://www.legco.gov.hk/general/chinese/counmtg/yr08-12/mtg_0809.htm",
  "http://www.legco.gov.hk/general/chinese/counmtg/yr08-12/mtg_0910.htm",
  "http://www.legco.gov.hk/general/chinese/counmtg/yr08-12/mtg_1011.htm",
  "http://www.legco.gov.hk/general/chinese/counmtg/yr08-12/mtg_1112.htm",
  "http://www.legco.gov.hk/general/chinese/counmtg/yr12-16/mtg_1213.htm"
]

# URLs that process_page has already visited, so no page is crawled twice.
# A Set keeps the `include?` check cheap as the crawl grows and ignores
# duplicate insertions; the constant keeps its original name because
# process_page refers to it.
STACK = Set.new
# Crawls +url+ and prints every voting-result document reachable from it.
#
# The page is fetched and parsed as HTML. For each <a> whose href contains
# "/voting/": an href ending in ".pdf" (any letter case) is reported on stdout
# as "PDF: <absolute url>"; any other href is treated as a further page and
# crawled recursively. Every <img> whose src contains "/voting/" is reported
# as "IMG: <absolute url>". References are resolved against +url+ before
# being printed or followed.
#
# URLs already recorded in STACK are skipped, so each page is fetched at most
# once per run even when pages link to each other.
#
# Errors raised while fetching the page (for example OpenURI::HTTPError) are
# not rescued and propagate to the caller.
#
# @param url [String] absolute URL of the page to crawl
# @return [void]
def process_page(url)
  return if STACK.include?(url)

  STACK << url

  # URI.open instead of Kernel#open: since Ruby 3.0 requiring open-uri no
  # longer makes Kernel#open fetch URLs. The block form closes the handle as
  # soon as the document has been parsed.
  doc = URI.open(url) { |io| Nokogiri::HTML(io) }
  base = URI.parse(url)

  doc.search("a").each do |link|
    href = link["href"]
    target = resolve_voting_reference(base, href)
    next unless target

    if href.match?(/\.pdf$/i)
      puts "PDF: #{target}"
    else
      # Absolute and relative hrefs are both resolved first, so the same page
      # reached through either form is recognised as already visited.
      process_page(target)
    end
  end

  doc.search("img").each do |img|
    target = resolve_voting_reference(base, img["src"])
    puts "IMG: #{target}" if target
  end
end

# Resolves +ref+ against +base+ when it points into a "/voting/" path.
#
# @param base [URI::Generic] URI of the page the reference was found on
# @param ref [String, nil] raw href/src attribute value
# @return [String, nil] the absolute URL, or nil when +ref+ is missing, does
#   not contain "/voting/", or is not a valid URI reference
def resolve_voting_reference(base, ref)
  return unless ref&.include?("/voting/")

  (base + ref).to_s
rescue URI::InvalidURIError => e
  # One malformed attribute should not abort the whole crawl.
  warn "Skipping unparsable reference #{ref.inspect}: #{e.message}"
  nil
end
# Crawl every seed page in turn. The visited list is shared across calls, so
# a page reachable from several seeds is only fetched the first time.
urls.each { |url| process_page(url) }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment