Skip to content

Instantly share code, notes, and snippets.

@isakb
Created March 17, 2012 14:15
Show Gist options
  • Select an option

  • Save isakb/2059670 to your computer and use it in GitHub Desktop.

Select an option

Save isakb/2059670 to your computer and use it in GitHub Desktop.
A spider I wrote many years ago. Put here for archaeological purposes. Crawls web pages and creates word histograms to figure out the most common words.
#! /usr/bin/env ruby
['rubygems', 'spider', 'hpricot', 'iconv'].each {|f| require f }
# Converts strings into a fixed target encoding with Iconv, substituting "?"
# for every character Iconv refuses to convert.
class Converter
  # encoding_to - name of the target encoding handed to Iconv (default "utf8").
  def initialize(encoding_to="utf8")
    @to = encoding_to
  end

  # Converts +str+ from +encoding_from+ into the target encoding.
  #
  # str           - the text to convert.
  # encoding_from - name of the encoding +str+ is currently in.
  #
  # Whenever Iconv reports a failure, the part converted so far is kept, the
  # first character of the rejected remainder is replaced by "?" (and logged
  # to $stderr), and conversion resumes with the rest of the input.
  #
  # Returns the converted string. Errors that are not Iconv conversion
  # failures are not handled here and propagate to the caller.
  def convert(str, encoding_from)
    iconv = Iconv.new(@to, encoding_from)
    converted = ""
    pending = str
    begin
      converted << iconv.iconv(pending)
    rescue Iconv::Failure => e
      # Only Iconv failures carry #success / #failed; rescuing anything
      # broader would mask unrelated errors behind a NoMethodError.
      converted << e.success
      # Drop the offending character and continue with what follows it, so
      # every retry works on a strictly shorter remainder.
      bad_char, pending = e.failed.split(//, 2)
      converted << "?"
      $stderr.puts "FAIL !! char:#{bad_char}"
      retry
    end
    converted
  end
end
# Crawls a web site with the Spider gem and builds word-frequency histograms
# from the body text of every page whose charset can be detected.
#
# Words longer than a configurable threshold are tallied in a separate
# "long word" histogram. Both histograms are written as tab-separated text
# files into the current directory by #save.
class WordSpider
  # Raised inside the crawl to stop once the page budget is used up.
  class StopSpidering < RuntimeError; end

  # URLs ending in one of these extensions are never fetched.
  SKIPPED_EXTENSIONS = %r{\.(xml|rss|css|js|jpg|gif|png|swf|pdf|doc|exe|mp3)$}

  # Extracts the charset from a content="text/html; charset=..." attribute.
  CHARSET_PATTERN = /content="text\/html; charset=(.+?)"/

  # Any run of whitespace (newlines included).
  WHITESPACE_RUN = /\s+/um

  # Intended to drop tokens with embedded non-letters (e-mail addresses etc.).
  # NOTE(review): the lazy match ends at the first word boundary after the
  # non-letter, so a trailing fragment of such a token can survive - confirm
  # whether that is acceptable.
  NON_WORD_TOKEN = /\b[^\s]*?[^[:alpha:]åäöüïéáàô\s]+?[^\s]*?\b/um

  # Any run of characters that are neither letters nor whitespace.
  NON_LETTER_RUN = /[^\s[:alpha:]åäöüïéáàô]+/um

  # site      - Hash describing the crawl:
  #             :name               - prefix of the output file names
  #             :start_url          - URL the crawl starts at
  #             :domain             - regexp fragment a URL's host must end
  #                                   with; no domain check when nil
  #             :page_limit         - number of pages to process
  #             :simulate_googlebot - send a Googlebot User-Agent when true
  #             :debug              - re-raise crawl errors when true
  # long_word - words with more characters than this go to the long-word
  #             histogram.
  def initialize(site = {
    :name => 'NAME',
    :start_url => 'http://www.***.se/',
    :domain => '***\.se',
    :page_limit => 10,
    :simulate_googlebot => true},
    long_word=15 )
    @site = site
    @utf = Converter.new("utf8")
    @word_dict = Hash.new(0)
    @longword_dict = Hash.new(0)
    @long_word_length = long_word
  end

  # Runs the crawl and saves the histograms when it ends, whether it ended
  # by running out of links, by reaching the page limit, or by an error.
  #
  # page_limit - accepted for backward compatibility but not used; the limit
  #              is always taken from site[:page_limit].
  #
  # Raises whatever the crawl raised when site[:debug] is set; otherwise
  # errors are printed and swallowed.
  def run(page_limit=10)
    @time_started = Time.now.strftime("%Y%m%d-%H%M%S")
    @pages_left = @site[:page_limit]
    puts "Starting"
    Thread.abort_on_exception = true
    begin
      Spider.start_at(@site[:start_url]) { |s| configure_spider(s) }
      puts "No more links to follow."
    rescue StopSpidering
      puts "Aborting spidering due to limit reached."
    rescue => e
      raise if @site[:debug]
      puts "Aborting due to exception:"
      p e
    end
    puts "Sorting word histogram according to frequency and saving to file."
    save
  end

  # Writes both histograms, most frequent word first, one "word<TAB>count"
  # line per entry. File names are built from the site name, the page limit
  # and the start time recorded by #run.
  def save
    print "Saving wordhistogram... "
    write_histogram("WordHistograms", @word_dict)
    print "Saving longwordhistogram..."
    write_histogram("LongWordHistograms", @longword_dict)
    print "Done saving files.\n"
  end

  private

  # Registers request setup, URL filters and response handlers on spider +s+.
  def configure_spider(s)
    # see: http://spider.rubyforge.org/
    s.setup do |_a_url|
      if @site[:simulate_googlebot]
        s.headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
      end
    end

    if @site[:domain]
      s.add_url_check do |a_url|
        a_url =~ %r{^http://(.+?\.)?#{@site[:domain]}(/.*)?$}
      end
    end

    s.add_url_check { |a_url| a_url !~ SKIPPED_EXTENSIONS }

    # The handler reports missing pages, so it must be bound to status 404.
    s.on 404 do |a_url, _resp, prior_url|
      puts "404 not found: #{a_url}, prior: #{prior_url}"
    end

    s.on :failure do |a_url, _resp, prior_url|
      puts "URL failed: #{a_url}"
      puts " linked from #{prior_url}"
    end

    s.on :success do |a_url, resp, _prior_url|
      process_page(a_url, resp)
    end
  end

  # Handles one successfully fetched page: counts its words when the charset
  # is known, saves periodically, and stops the crawl when the budget is spent.
  def process_page(a_url, resp)
    puts "#{a_url}: #{resp.code}"
    html = resp.body
    encoding = html[CHARSET_PATTERN, 1]
    if encoding.nil?
      # Pages without a detectable charset are skipped and not counted
      # against the page limit.
      puts "Unknown encoding at #{a_url}"
    else
      @pages_left -= 1
      puts "Processing content..."
      text = page_text(html, encoding)
      puts "Adding words to word histogram"
      found = count_words(text)
      puts "Found #{found} words on page in all."
      puts "Word dict now contains #{@word_dict.length} unique words."
      puts "LongWord dict now contains #{@longword_dict.length} unique words."
    end
    puts "#{@pages_left} pages left to visit..."
    puts
    save if @pages_left.modulo(10) == 0
    sleep(0.1)
    raise StopSpidering if @pages_left < 1
  end

  # Returns the text of the page body with non-content elements removed.
  def page_text(html, encoding)
    markup = @utf.convert(html, encoding)
    # This fixes issues with <li>word</li><li>another word</li> joining:
    markup.gsub!(/></, "> <")
    doc = Hpricot(markup).search("body")
    doc.search("script|embed|object|style|link|head|pre").remove
    doc.inner_text
  end

  # Adds every word of +text+ (two characters or longer) to the matching
  # histogram and returns how many words were added.
  def count_words(text)
    # Flatten whitespace first so newlines do not disturb the token patterns.
    cleaned = text.downcase.gsub(WHITESPACE_RUN, " ")
    cleaned = cleaned.gsub(NON_WORD_TOKEN, " ")
    cleaned = cleaned.gsub(NON_LETTER_RUN, " ")
    added = 0
    # split(" ") already ignores repeated and leading whitespace.
    cleaned.split(" ").each do |word|
      next if word.length < 2
      if word.length > @long_word_length
        @longword_dict[word] += 1
      else
        @word_dict[word] += 1
      end
      added += 1
    end
    added
  end

  # Writes +histogram+ to the output file for +kind+, highest count first.
  def write_histogram(kind, histogram)
    path = "#{@site[:name]}_#{@site[:page_limit]}_#{kind}_#{@time_started}.txt"
    # Block form closes the file even when writing fails.
    File.open(path, "w") do |f|
      histogram.sort { |a, b| b[1] <=> a[1] }.each do |entry|
        f.puts "%s\t%s" % entry
      end
    end
  end
end
# Example use:
# :domain is a regexp fragment that WordSpider inserts right after an
# optional "subdomain." part of the URL, so it has to name the host itself
# (a bare '\.com' would demand two consecutive dots and match nothing).
site_options = {
  :name => 'Whatever',
  :start_url => 'http://you-web-page.com',
  :domain => 'you-web-page\.com',
  :page_limit => 100,
  :simulate_googlebot => true,
  :debug => true
}
ws = WordSpider.new(site_options)
ws.run
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment