Skip to content

Instantly share code, notes, and snippets.

@rb2k
Created May 5, 2010 17:31
Show Gist options
  • Save rb2k/391134 to your computer and use it in GitHub Desktop.
Save rb2k/391134 to your computer and use it in GitHub Desktop.
require "nokogiri"
require "hpricot"
require "open-uri"
#on my macbook
#$ ruby speedtest.rb
#uri took: 4.539837
#hpricot took: 3.490182
#nokogiri took: 6.273096
#on EC2 small:
#uri took: 11.428387987
#hpricot took: 8.071451074000001
#nokogiri took: 12.722893389
puts "getting data"
# Download the benchmark page once so every extractor parses the same markup.
# URI#read (provided by open-uri) returns the whole body as a single String.
# Joining readlines with "\n" would double every line break, because readlines
# keeps each line's trailing newline, and Kernel#open on a URL is not
# available on Ruby 3.0+.
html_string = URI.parse("http://www.reddit.com/").read
puts "done"
# Collects the distinct href values of every <a> element selected with the
# CSS selector "a", reading each value through the attribute's #content.
#
# html - String of HTML markup handed to Nokogiri::HTML.
#
# Returns an Array of unique href values in first-seen order. Anchors without
# an href attribute contribute a single nil entry.
def extract_links_noko_css(html)
  hrefs = Nokogiri::HTML(html).css("a").map do |link|
    href_attr = link.attributes["href"]
    # Explicit nil check instead of a blanket `rescue nil`, so only a missing
    # attribute maps to nil and genuine errors still surface.
    href_attr && href_attr.content
  end
  hrefs.uniq
end
# Variant of the CSS-selector extractor that does NOT call #content: it
# returns whatever `attributes["href"]` yields for each <a> element.
#
# html - String of HTML markup handed to Nokogiri::HTML.
#
# Returns an Array of the looked-up attribute values (presumably
# Nokogiri::XML::Attr objects rather than Strings - confirm against the
# Nokogiri version in use), with nil for anchors lacking an href. Duplicates
# are removed by Array#uniq, so de-duplication depends on how those objects
# define #hash / #eql?.
def extract_links_noko_css_nocontent(html)
  # A missing key already yields nil, so no rescue is needed; real errors
  # are allowed to propagate.
  Nokogiri::HTML(html).css("a").map { |link| link.attributes["href"] }.uniq
end
# Extracts link targets with Nokogiri using the XPath query "//a[@href]",
# i.e. only anchors that carry an href attribute.
#
# html - String of HTML markup handed to Nokogiri::HTML.
#
# Returns an Array of unique href values in first-seen order.
def extract_links_noko_xpath(html)
  anchors = Nokogiri::HTML(html).search("//a[@href]")
  anchors.map { |anchor| anchor["href"] }.uniq
end
# Extracts link targets with Nokogiri using the XPath query "//a", which
# matches every anchor whether or not it has an href attribute.
#
# html - String of HTML markup handed to Nokogiri::HTML.
#
# Returns an Array of unique href values in first-seen order. Anchors without
# an href contribute a single nil entry.
def extract_links_noko_xpath_nohref(html)
  # Node#[] is expected to return nil for a missing attribute, so the lookup
  # needs no `rescue nil`; unexpected errors are allowed to propagate.
  Nokogiri::HTML(html).search("//a").map { |link| link["href"] }.uniq
end
# Extracts link targets with Hpricot using the XPath query "//a[@href]",
# i.e. only anchors that carry an href attribute.
#
# html - String of HTML markup handed to Hpricot().
#
# Returns an Array of unique href values in first-seen order.
def extract_links_hpricot(html)
  document = Hpricot(html)
  hrefs = document.search("//a[@href]").map { |anchor| anchor["href"] }
  hrefs.uniq
end
# Extracts link targets with Hpricot using the CSS selector "a" and the
# element's attributes collection.
#
# html - String of HTML markup handed to Hpricot().
#
# Returns an Array of unique href values in first-seen order. For anchors
# without an href the entry is whatever Hpricot's attribute lookup returns
# for a missing key (nil or an empty String depending on the Hpricot
# version - confirm).
def extract_links_hpricot_css(html)
  # Hpricot attribute values are plain Strings, not nodes, so there is no
  # #content to call. Calling it raised NoMethodError on every link, and a
  # `rescue nil` turned each one into nil, making the result [nil].
  Hpricot(html).search("a").map { |link| link.attributes["href"] }.uniq
end
# Extracts link targets with Hpricot using the XPath query "//a", which
# matches every anchor whether or not it has an href attribute.
#
# html - String of HTML markup handed to Hpricot().
#
# Returns an Array of unique values of `anchor["href"]` in first-seen order.
def extract_links_hpricot_nohref(html)
  Hpricot(html).search("//a").collect { |anchor|
    anchor["href"]
  }.uniq
end
# Benchmark driver: times each extractor over the same downloaded page and
# prints one "<label> took: <seconds>" line per extractor.
#
# Each entry is [label used in the report, extractor method name]; the order
# here is the order in which results are printed.
benchmarks = [
  ["hpricot (xpath)", :extract_links_hpricot],
  ["hpricot (xpath, no href)", :extract_links_hpricot_nohref],
  ["hpricot (css)", :extract_links_hpricot_css],
  ["nokogiri (xpath)", :extract_links_noko_xpath],
  ["nokogiri (css_nocontent)", :extract_links_noko_css_nocontent],
  ["nokogiri (css)", :extract_links_noko_css],
  ["nokogiri (xpath no href)", :extract_links_noko_xpath_nohref]
]

benchmarks.each do |label, extractor|
  start = Time.now
  # The extractor's return value is irrelevant here; only elapsed time matters.
  250.times { send(extractor, html_string) }
  took = Time.now - start
  puts "#{label} took: #{took}"
end
hpricot (xpath) took: 3.599083
hpricot (xpath, no href) took: 3.283622
hpricot (css) took: 4.996853
nokogiri (xpath) took: 4.169071
nokogiri (css_nocontent) took: 4.372877
nokogiri (css) took: 4.494918
nokogiri (xpath no href) took: 3.861592
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment