Created
March 7, 2013 22:03
Given markup containing a bunch of 'A' tags, fetch those URLs and extract info from them. (Currently hardcoded; needs a little refactoring.)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'rubygems' | |
require 'nokogiri' | |
require 'open-uri' | |
require 'timeout' # to catch HTTP timeouts | |
INPUT_FILENAME = "urls.html"
HTTP_TIMEOUT = 5 # seconds

# selector for pulling out links. should just be "a"
LINK_SELECTOR = "a"
# FIXME: this should be passed on the cmd line
CSS_SELECTOR = "#site-name strong a span"

# NOTE(review): the original had an empty interpolation ("urls.#{}.tsv",
# which yields "urls..tsv") — presumably a timestamp was intended, so stamp
# the output with the run time to keep successive runs from clobbering each other.
OUTPUT_FILE = "urls.#{Time.now.strftime('%Y%m%d%H%M%S')}.tsv"
OUTPUT_COLS = ["anchor_text", "url", "matched_text"]

# array of objects for later formatting/output
results = []
# ---- | |
# Return the text of the first node in +page+ matching +css_selector+,
# or an error string when nothing matches.
def match_selector(page, css_selector)
  matches = page.css(css_selector)
  return "ERROR: Couldn't parse CSS selector." if matches.empty?

  matches.first.text
end
# Parse raw +markup+ into a Nokogiri document and run the (currently
# hard-coded) CSS_SELECTOR match against it.
def parse_doc(markup)
  doc = Nokogiri::HTML(markup)
  match_selector(doc, CSS_SELECTOR)
end
# Fetch +url+ (bounded by HTTP_TIMEOUT seconds) and return the text matched
# by CSS_SELECTOR on the resulting page, or an "ERROR: ..." string on failure.
def match_text(url)
  remote_doc = Timeout.timeout(HTTP_TIMEOUT) do
    # Kernel#open on a URL was deprecated in Ruby 2.7 and removed in 3.0;
    # URI.open is the supported open-uri entry point.
    URI.open(url)
  end
  parse_doc(remote_doc)
rescue OpenURI::HTTPError => e
  # sometimes we want to parse the 404 error page, in case the error page
  # carries contextual info
  parse_doc(e.io.read)
rescue Timeout::Error
  "ERROR: Timeout"
rescue StandardError => e
  # surface the actual failure instead of the former opaque "ERROR: Unknown"
  "ERROR: #{e.class}: #{e.message}"
end
# Write +results+ (array of {:anchor_text, :url, :matched_text} hashes) to
# OUTPUT_FILE as TSV: a header row followed by one line per result.
#
# Fixes three bugs in the original: the header columns were joined with "\n"
# instead of "\t"; a String += Array raised TypeError; and String has no
# #join, so the final write raised NoMethodError.
def output_results(results)
  lines = [OUTPUT_COLS.join("\t")]
  lines += results.map { |row| "#{row[:anchor_text]}\t#{row[:url]}\t#{row[:matched_text]}" }
  File.open(OUTPUT_FILE, 'w') { |f| f.write(lines.join("\n")) }
end
# Read and parse the input HTML; the block form of File.open closes the
# file automatically (resolves the old FIXME about refactoring into a block).
url_doc = File.open(INPUT_FILENAME) { |f| Nokogiri::HTML(f) }

# pull the anchors out of the HTML doc
anchors = url_doc.css(LINK_SELECTOR)
puts "Checking #{anchors.length} URLs..."

# iterate over the anchors
anchors.each_with_index do |anchor, i|
  # grab the URL from the A tag; anchor['href'] returns nil (instead of
  # raising NoMethodError) when the tag has no href attribute
  url = anchor['href']
  next if url.nil? || url.empty?

  # 1-based progress display (the original printed "[0/N]" for the first URL)
  puts "[#{i + 1}/#{anchors.length}] #{url}"
  # perform the match on the page
  matched_text = match_text(url)
  # append the data to the results array
  results << {
    :anchor_text => anchor.text,
    :url => url,
    :matched_text => matched_text
  }
end

# output the data
output_results(results)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment