Created
March 7, 2013 22:03
Given markup containing a bunch of 'A' tags, fetch those URLs and extract info from them. (Currently hardcoded; needs a little refactoring.)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'rubygems' | |
require 'nokogiri' | |
require 'open-uri' | |
require 'timeout' # to catch HTTP timeouts | |
INPUT_FILENAME = "urls.html"
HTTP_TIMEOUT = 5 # seconds

# selector for pulling out links. should just be "a"
LINK_SELECTOR = "a"
# FIXME: this should be passed on the cmd line
CSS_SELECTOR = "#site-name strong a span"

# NOTE(review): the original had an empty interpolation ("urls.#{}.tsv",
# which yields "urls..tsv") — presumably a timestamp was intended, so stamp
# the output with the run time to keep successive runs from clobbering each other.
OUTPUT_FILE = "urls.#{Time.now.strftime('%Y%m%d%H%M%S')}.tsv"
OUTPUT_COLS = ["anchor_text", "url", "matched_text"]

# array of objects for later formatting/output
results = []
# ---- | |
# Return the text of the first node in +page+ matching +css_selector+,
# or an error string when nothing matches.
def match_selector(page, css_selector)
  matches = page.css(css_selector)
  return "ERROR: Couldn't parse CSS selector." if matches.empty?

  matches.first.text
end
# Parse raw +markup+ into a Nokogiri document and run the (currently
# hard-coded) CSS_SELECTOR match against it.
def parse_doc(markup)
  doc = Nokogiri::HTML(markup)
  match_selector(doc, CSS_SELECTOR)
end
# Fetch +url+ (bounded by HTTP_TIMEOUT seconds) and return the text matched
# by CSS_SELECTOR on the resulting page, or an "ERROR: ..." string on failure.
def match_text(url)
  remote_doc = Timeout.timeout(HTTP_TIMEOUT) do
    # Kernel#open on a URL was deprecated in Ruby 2.7 and removed in 3.0;
    # URI.open is the supported open-uri entry point.
    URI.open(url)
  end
  parse_doc(remote_doc)
rescue OpenURI::HTTPError => e
  # sometimes we want to parse the 404 error page, in case the error page
  # carries contextual info
  parse_doc(e.io.read)
rescue Timeout::Error
  "ERROR: Timeout"
rescue StandardError => e
  # surface the actual failure instead of the former opaque "ERROR: Unknown"
  "ERROR: #{e.class}: #{e.message}"
end
# Write +results+ (array of {:anchor_text, :url, :matched_text} hashes) to
# OUTPUT_FILE as TSV: a header row followed by one line per result.
#
# Fixes three bugs in the original: the header columns were joined with "\n"
# instead of "\t"; a String += Array raised TypeError; and String has no
# #join, so the final write raised NoMethodError.
def output_results(results)
  lines = [OUTPUT_COLS.join("\t")]
  lines += results.map { |row| "#{row[:anchor_text]}\t#{row[:url]}\t#{row[:matched_text]}" }
  File.open(OUTPUT_FILE, 'w') { |f| f.write(lines.join("\n")) }
end
# Read and parse the input HTML; the block form of File.open closes the
# file automatically (resolves the old FIXME about refactoring into a block).
url_doc = File.open(INPUT_FILENAME) { |f| Nokogiri::HTML(f) }

# pull the anchors out of the HTML doc
anchors = url_doc.css(LINK_SELECTOR)
puts "Checking #{anchors.length} URLs..."

# iterate over the anchors
anchors.each_with_index do |anchor, i|
  # grab the URL from the A tag; anchor['href'] returns nil (instead of
  # raising NoMethodError) when the tag has no href attribute
  url = anchor['href']
  next if url.nil? || url.empty?

  # 1-based progress display (the original printed "[0/N]" for the first URL)
  puts "[#{i + 1}/#{anchors.length}] #{url}"
  # perform the match on the page
  matched_text = match_text(url)
  # append the data to the results array
  results << {
    :anchor_text => anchor.text,
    :url => url,
    :matched_text => matched_text
  }
end

# output the data
output_results(results)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment