alanho · April 22, 2013 09:24
diff --git a/legco_extract.rb b/legco_extract.rb
 require "nokogiri"
 require "open-uri"
 require "uri"

 # Seed pages: the crawl is started once from each of these URLs.
 urls = %w[
   http://www.legco.gov.hk/general/chinese/counmtg/yr08-12/mtg_0809.htm
   http://www.legco.gov.hk/general/chinese/counmtg/yr08-12/mtg_0910.htm
   http://www.legco.gov.hk/general/chinese/counmtg/yr08-12/mtg_1011.htm
   http://www.legco.gov.hk/general/chinese/counmtg/yr08-12/mtg_1112.htm
   http://www.legco.gov.hk/general/chinese/counmtg/yr12-16/mtg_1213.htm
 ]

 # Every URL process_page has already been asked to crawl; it is consulted
 # there so that the same page is never fetched twice.
 STACK = []

 # Fetches +url+, prints the voting-related assets it references, and
 # recursively crawls any further voting-related pages it links to.
 #
 # For every <a> whose href contains "/voting/":
 # * an href ending in ".pdf" is printed as "PDF: <absolute url>";
 # * an href that is already an absolute http(s) URL is crawled as-is;
 # * any other href is resolved against +url+ and then crawled.
 # For every <img> whose src contains "/voting/", "IMG: <absolute url>" is
 # printed.
 #
 # url - String URL of the page to crawl. It is fetched through open-uri, so
 #       it must use a scheme open-uri can open (e.g. http/https).
 #
 # A URL already recorded in STACK is skipped, which stops cycles between
 # pages from recursing forever. The return value is not meaningful.
 #
 # Raises URI::InvalidURIError if +url+ or a followed href cannot be parsed;
 # errors raised by open-uri while fetching propagate unchanged.
 def process_page(url)
   return if STACK.include? url
   STACK << url

   # Parse once; URI#+ returns a new URI, so this base is never mutated.
   base = URI.parse(url)

   # Open through the URI object instead of Kernel#open: open-uri's Kernel#open
   # patch was removed in Ruby 3.0, and Kernel#open would run a shell command
   # for a scraped string that begins with "|". The block form also closes the
   # downloaded stream as soon as it has been parsed.
   doc = base.open { |io| Nokogiri::HTML(io) }

   doc.search("a").each do |link|
     href = link["href"]
     # A missing href is nil, and nil =~ regexp is nil, so it is skipped here.
     next unless href =~ /\/voting\//

     if href =~ /\.pdf$/
       print "PDF: #{base + href}\n"
     elsif href =~ /\Ahttps?:\/\//
       # \A (not ^) so the scheme must start the string, not merely a line.
       process_page href
     else
       process_page((base + href).to_s)
     end
   end

   doc.search("img").each do |img|
     src = img["src"]
     print "IMG: #{base + src}\n" if src =~ /\/voting\//
   end
 end


 # Start a crawl from each seed page in turn.
 urls.each { |seed| process_page(seed) }
	require "nokogiri"
	require "open-uri"
	require "uri"

	# Seed pages: the crawl is started once from each of these URLs.
	urls = %w[
	  http://www.legco.gov.hk/general/chinese/counmtg/yr08-12/mtg_0809.htm
	  http://www.legco.gov.hk/general/chinese/counmtg/yr08-12/mtg_0910.htm
	  http://www.legco.gov.hk/general/chinese/counmtg/yr08-12/mtg_1011.htm
	  http://www.legco.gov.hk/general/chinese/counmtg/yr08-12/mtg_1112.htm
	  http://www.legco.gov.hk/general/chinese/counmtg/yr12-16/mtg_1213.htm
	]

	# Every URL process_page has already been asked to crawl; it is consulted
	# there so that the same page is never fetched twice.
	STACK = []

	# Fetches +url+, prints the voting-related assets it references, and
	# recursively crawls any further voting-related pages it links to.
	#
	# For every <a> whose href contains "/voting/":
	# * an href ending in ".pdf" is printed as "PDF: <absolute url>";
	# * an href that is already an absolute http(s) URL is crawled as-is;
	# * any other href is resolved against +url+ and then crawled.
	# For every <img> whose src contains "/voting/", "IMG: <absolute url>" is
	# printed.
	#
	# url - String URL of the page to crawl. It is fetched through open-uri, so
	#       it must use a scheme open-uri can open (e.g. http/https).
	#
	# A URL already recorded in STACK is skipped, which stops cycles between
	# pages from recursing forever. The return value is not meaningful.
	#
	# Raises URI::InvalidURIError if +url+ or a followed href cannot be parsed;
	# errors raised by open-uri while fetching propagate unchanged.
	def process_page(url)
	  return if STACK.include? url
	  STACK << url

	  # Parse once; URI#+ returns a new URI, so this base is never mutated.
	  base = URI.parse(url)

	  # Open through the URI object instead of Kernel#open: open-uri's
	  # Kernel#open patch was removed in Ruby 3.0, and Kernel#open would run a
	  # shell command for a scraped string that begins with "|". The block form
	  # also closes the downloaded stream as soon as it has been parsed.
	  doc = base.open { |io| Nokogiri::HTML(io) }

	  doc.search("a").each do |link|
	    href = link["href"]
	    # A missing href is nil, and nil =~ regexp is nil, so it is skipped here.
	    next unless href =~ /\/voting\//

	    if href =~ /\.pdf$/
	      print "PDF: #{base + href}\n"
	    elsif href =~ /\Ahttps?:\/\//
	      # \A (not ^) so the scheme must start the string, not merely a line.
	      process_page href
	    else
	      process_page((base + href).to_s)
	    end
	  end

	  doc.search("img").each do |img|
	    src = img["src"]
	    print "IMG: #{base + src}\n" if src =~ /\/voting\//
	  end
	end


	# Start a crawl from each seed page in turn.
	# (The block parameter is written with plain pipes; the backslash-escaped
	# form is not valid Ruby.)
	urls.each do |url|
	  process_page url
	end