isakb · March 17, 2012 14:15
diff --git a/word-spider.rb b/word-spider.rb
 #! /usr/bin/env ruby
 ['rubygems', 'spider', 'hpricot', 'iconv'].each {|f| require f }
 # Re-encodes strings into one fixed target encoding via Iconv, substituting
 # "?" for every character that cannot be converted.
 class Converter
   # encoding_to - target encoding name handed to Iconv.new (defaults to "utf8").
   def initialize(encoding_to="utf8")
     @to = encoding_to
   end

   # Converts +str+ from +encoding_from+ into the target encoding.
   #
   # str           - the text to convert.
   # encoding_from - source encoding name handed to Iconv.new.
   #
   # Returns the converted String. Each character Iconv rejects is replaced
   # by "?" and reported on $stderr. Only Iconv::Failure errors are handled
   # that way; any other error propagates to the caller unchanged.
   def convert(str, encoding_from)
     i = Iconv.new(@to, encoding_from)
     c_str = ""
     begin
       begin
         c_str << i.iconv(str)
       rescue Iconv::Failure => e
         # Keep what was converted before the failure, swap the first
         # offending character for "?" and retry with the remainder. Every
         # retry consumes at least one character, so the loop terminates.
         c_str << e.success
         ch, str = e.failed.split(//, 2)
         c_str << "?"
         $stderr.puts "FAIL !! char:#{ch}"
         retry
       end
     ensure
       # Release the conversion descriptor even when an error escapes.
       i.close
     end
     c_str
   end
 end

 # Crawls a web site with the Spider library and builds word-frequency
 # histograms from the text of every fetched page. Words longer than a
 # configurable number of characters are tallied in a separate histogram.
 # Both histograms are written to tab-separated text files.
 class WordSpider

   # Raised from the success handler to end the crawl once the page budget is used up.
   class StopSpidering < RuntimeError; end

   # site      - Hash of crawl settings. Keys read by this class:
   #             :name               - prefix of the output file names
   #             :start_url          - URL handed to Spider.start_at
   #             :domain             - regexp fragment a URL's host must match (optional)
   #             :page_limit         - number of pages to process
   #             :simulate_googlebot - send a Googlebot User-Agent header when true
   #             :debug              - re-raise unexpected errors instead of printing them
   # long_word - words with more characters than this go to the long-word histogram.
   def initialize(site = {
           :name => 'NAME',
           :start_url => 'http://www.***.se/',
           :domain => '***\.se',
           :page_limit => 10,
           :simulate_googlebot => true},
         long_word=15 )
     @site = site
     @utf = Converter.new("utf8")
     @word_dict = Hash.new(0)
     @longword_dict = Hash.new(0)
     @CHARS_IN_LONG_WORD = long_word
   end

   # Runs the crawl and saves the histograms when it ends.
   #
   # page_limit - page budget used only when the site hash has no :page_limit.
   #
   # Only pages that declare their charset in a content-type meta tag are
   # processed and counted against the budget.
   def run(page_limit=10)
     @time_started = Time.now.strftime("%Y%m%d-%H%M%S")
     count_pages = @site[:page_limit] || page_limit
     puts "Starting"
     Thread.abort_on_exception = true
     begin
       Spider.start_at(@site[:start_url]) do |s|
         # see: http://spider.rubyforge.org/
         s.setup do |a_url|
           if @site[:simulate_googlebot]
             s.headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
           end
         end
         if @site[:domain]
           # Stay on the configured domain (any subdomain allowed).
           s.add_url_check do |a_url|
             a_url =~ %r{^http://(.+?\.)?#{@site[:domain]}(/.*)?$}
           end
         end
         # Skip feeds, assets and binary downloads.
         s.add_url_check do |a_url|
           not a_url =~ %r{\.(xml|rss|css|js|jpg|gif|png|swf|pdf|doc|exe|mp3)$}
         end
         # The handler reports "404 not found", so register it for status 404.
         s.on 404 do |a_url, resp, prior_url|
           puts "404 not found: #{a_url}, prior: #{prior_url}"
         end
         s.on :failure do |a_url, resp, prior_url|
           puts "URL failed: #{a_url}"
           puts " linked from #{prior_url}"
         end
         s.on :success do |a_url, resp, prior_url|
           puts "#{a_url}: #{resp.code}"
           str = resp.body
           encoding = str[/content="text\/html; charset=(.+?)"/, 1]
           if encoding.nil?
             puts "Unknown encoding at #{a_url}"
           else
             count_pages -= 1
             puts "Processing content..."
             str = @utf.convert(str, encoding)
             # This fixes issues with <li>word</li><li>another word</li> joining:
             str.gsub!(/></, "> <")
             doc = Hpricot(str).search("body")
             doc.search("script|embed|object|style|link|head|pre").remove
             puts "Adding words to word histogram"
             i = count_words(doc.inner_text)
             puts "Found #{i} words on page in all."
             puts "Word dict now contains #{@word_dict.length} unique words."
             puts "LongWord dict now contains #{@longword_dict.length} unique words."
           end
           puts "#{count_pages} pages left to visit..."
           puts
           # Checkpoint the histograms every tenth page.
           save() if count_pages.modulo(10) == 0
           sleep(0.1)
           raise StopSpidering if count_pages < 1
         end
       end
       puts "No more links to follow."
     rescue StopSpidering
       puts "Aborting spidering due to limit reached."
     rescue => e
       if @site[:debug]
         raise
       else
         puts "Aborting due to exception:"
         p e
       end
     end
     puts "Sorting word histogram according to frequency and saving to file."
     # save to file
     save()
   end

   # Writes both histograms, most frequent word first, to files named
   # "<name>_<page_limit>_WordHistograms_<start time>.txt" and
   # "<name>_<page_limit>_LongWordHistograms_<start time>.txt".
   def save
     print "Saving wordhistogram... "
     write_histogram("WordHistograms", @word_dict)
     print "Saving longwordhistogram..."
     write_histogram("LongWordHistograms", @longword_dict)
     print "Done saving files.\n"
   end

   private

   # Normalises +text+, adds every word of two or more characters to the
   # matching histogram and returns how many words were added.
   def count_words(text)
     words = text.downcase
     # first replace all newlines etc that disturbs the text flow
     words.gsub!(/\s+/um, " ")
     # Remove words containing illegal characters, such as blabla@address.com etc.
     # The accepted letters match the character class of the next step.
     words.gsub!(/\b[^\s]*?[^[:alpha:]åäöüïéáàô\s]+?[^\s]*?\b/um, " ")
     # Remove anything except word characters and spaces
     words.gsub!(/[^\s[:alpha:]åäöüïéáàô]+/um, " ")
     # Finally remove all repeated spaces again
     words.gsub!(/\s+/um, " ")
     added = 0
     words.split(" ").each do |w|
       next if w.length < 2
       if w.length > @CHARS_IN_LONG_WORD
         @longword_dict[w] += 1
       else
         @word_dict[w] += 1
       end
       added += 1
     end
     added
   end

   # Writes one histogram as "word<TAB>count" lines sorted by descending count.
   # The block form of File.open closes the file even if writing fails.
   def write_histogram(kind, dict)
     path = "#{@site[:name]}_#{@site[:page_limit]}_#{kind}_#{@time_started}.txt"
     File.open(path, "w") do |f|
       dict.sort {|a,b| b[1]<=>a[1]}.each { |t| f.puts "%s\t%s" % t }
     end
   end

 end

 # Example use:
 # :domain is interpolated into the URL-matching regexp built by
 # WordSpider#run, so it has to describe the host of :start_url. A bare
 # '\.com' never matches "http://you-web-page.com".
 ws = WordSpider.new({
          :name => 'Whatever',
          :start_url => 'http://you-web-page.com',
          :domain => 'you-web-page\.com',
          :page_limit => 100,
          :simulate_googlebot => true,
          :debug => true})
 ws.run
	#! /usr/bin/env ruby
	# Load the crawler, HTML parser and charset conversion libraries.
	['rubygems', 'spider', 'hpricot', 'iconv'].each {|f| require f }
	# Re-encodes strings into one fixed target encoding via Iconv, substituting
	# "?" for every character that cannot be converted.
	class Converter
	  # encoding_to - target encoding name handed to Iconv.new (defaults to "utf8").
	  def initialize(encoding_to="utf8")
	    @to = encoding_to
	  end

	  # Converts +str+ from +encoding_from+ into the target encoding.
	  #
	  # str           - the text to convert.
	  # encoding_from - source encoding name handed to Iconv.new.
	  #
	  # Returns the converted String. Each character Iconv rejects is replaced
	  # by "?" and reported on $stderr. Only Iconv::Failure errors are handled
	  # that way; any other error propagates to the caller unchanged.
	  def convert(str, encoding_from)
	    i = Iconv.new(@to, encoding_from)
	    c_str = ""
	    begin
	      begin
	        c_str << i.iconv(str)
	      rescue Iconv::Failure => e
	        # Keep what was converted before the failure, swap the first
	        # offending character for "?" and retry with the remainder. Every
	        # retry consumes at least one character, so the loop terminates.
	        c_str << e.success
	        ch, str = e.failed.split(//, 2)
	        c_str << "?"
	        $stderr.puts "FAIL !! char:#{ch}"
	        retry
	      end
	    ensure
	      # Release the conversion descriptor even when an error escapes.
	      i.close
	    end
	    c_str
	  end
	end

	# Crawls a web site with the Spider library and builds word-frequency
	# histograms from the text of every fetched page. Words longer than a
	# configurable number of characters are tallied in a separate histogram.
	# Both histograms are written to tab-separated text files.
	class WordSpider

	  # Raised from the success handler to end the crawl once the page budget is used up.
	  class StopSpidering < RuntimeError; end

	  # site      - Hash of crawl settings. Keys read by this class:
	  #             :name               - prefix of the output file names
	  #             :start_url          - URL handed to Spider.start_at
	  #             :domain             - regexp fragment a URL's host must match (optional)
	  #             :page_limit         - number of pages to process
	  #             :simulate_googlebot - send a Googlebot User-Agent header when true
	  #             :debug              - re-raise unexpected errors instead of printing them
	  # long_word - words with more characters than this go to the long-word histogram.
	  def initialize(site = {
	          :name => 'NAME',
	          :start_url => 'http://www.***.se/',
	          :domain => '***\.se',
	          :page_limit => 10,
	          :simulate_googlebot => true},
	        long_word=15 )
	    @site = site
	    @utf = Converter.new("utf8")
	    @word_dict = Hash.new(0)
	    @longword_dict = Hash.new(0)
	    @CHARS_IN_LONG_WORD = long_word
	  end

	  # Runs the crawl and saves the histograms when it ends.
	  #
	  # page_limit - page budget used only when the site hash has no :page_limit.
	  #
	  # Only pages that declare their charset in a content-type meta tag are
	  # processed and counted against the budget.
	  def run(page_limit=10)
	    @time_started = Time.now.strftime("%Y%m%d-%H%M%S")
	    count_pages = @site[:page_limit] || page_limit
	    puts "Starting"
	    Thread.abort_on_exception = true
	    begin
	      Spider.start_at(@site[:start_url]) do |s|
	        # see: http://spider.rubyforge.org/
	        s.setup do |a_url|
	          if @site[:simulate_googlebot]
	            s.headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
	          end
	        end
	        if @site[:domain]
	          # Stay on the configured domain (any subdomain allowed).
	          s.add_url_check do |a_url|
	            a_url =~ %r{^http://(.+?\.)?#{@site[:domain]}(/.*)?$}
	          end
	        end
	        # Skip feeds, assets and binary downloads.
	        s.add_url_check do |a_url|
	          not a_url =~ %r{\.(xml|rss|css|js|jpg|gif|png|swf|pdf|doc|exe|mp3)$}
	        end
	        # The handler reports "404 not found", so register it for status 404.
	        s.on 404 do |a_url, resp, prior_url|
	          puts "404 not found: #{a_url}, prior: #{prior_url}"
	        end
	        s.on :failure do |a_url, resp, prior_url|
	          puts "URL failed: #{a_url}"
	          puts " linked from #{prior_url}"
	        end
	        s.on :success do |a_url, resp, prior_url|
	          puts "#{a_url}: #{resp.code}"
	          str = resp.body
	          encoding = str[/content="text\/html; charset=(.+?)"/, 1]
	          if encoding.nil?
	            puts "Unknown encoding at #{a_url}"
	          else
	            count_pages -= 1
	            puts "Processing content..."
	            str = @utf.convert(str, encoding)
	            # This fixes issues with <li>word</li><li>another word</li> joining:
	            str.gsub!(/></, "> <")
	            doc = Hpricot(str).search("body")
	            doc.search("script|embed|object|style|link|head|pre").remove
	            puts "Adding words to word histogram"
	            i = count_words(doc.inner_text)
	            puts "Found #{i} words on page in all."
	            puts "Word dict now contains #{@word_dict.length} unique words."
	            puts "LongWord dict now contains #{@longword_dict.length} unique words."
	          end
	          puts "#{count_pages} pages left to visit..."
	          puts
	          # Checkpoint the histograms every tenth page.
	          save() if count_pages.modulo(10) == 0
	          sleep(0.1)
	          raise StopSpidering if count_pages < 1
	        end
	      end
	      puts "No more links to follow."
	    rescue StopSpidering
	      puts "Aborting spidering due to limit reached."
	    rescue => e
	      if @site[:debug]
	        raise
	      else
	        puts "Aborting due to exception:"
	        p e
	      end
	    end
	    puts "Sorting word histogram according to frequency and saving to file."
	    # save to file
	    save()
	  end

	  # Writes both histograms, most frequent word first, to files named
	  # "<name>_<page_limit>_WordHistograms_<start time>.txt" and
	  # "<name>_<page_limit>_LongWordHistograms_<start time>.txt".
	  def save
	    print "Saving wordhistogram... "
	    write_histogram("WordHistograms", @word_dict)
	    print "Saving longwordhistogram..."
	    write_histogram("LongWordHistograms", @longword_dict)
	    print "Done saving files.\n"
	  end

	  private

	  # Normalises +text+, adds every word of two or more characters to the
	  # matching histogram and returns how many words were added.
	  def count_words(text)
	    words = text.downcase
	    # first replace all newlines etc that disturbs the text flow
	    words.gsub!(/\s+/um, " ")
	    # Remove words containing illegal characters, such as blabla@address.com etc.
	    # The accepted letters match the character class of the next step.
	    words.gsub!(/\b[^\s]*?[^[:alpha:]åäöüïéáàô\s]+?[^\s]*?\b/um, " ")
	    # Remove anything except word characters and spaces
	    words.gsub!(/[^\s[:alpha:]åäöüïéáàô]+/um, " ")
	    # Finally remove all repeated spaces again
	    words.gsub!(/\s+/um, " ")
	    added = 0
	    words.split(" ").each do |w|
	      next if w.length < 2
	      if w.length > @CHARS_IN_LONG_WORD
	        @longword_dict[w] += 1
	      else
	        @word_dict[w] += 1
	      end
	      added += 1
	    end
	    added
	  end

	  # Writes one histogram as "word<TAB>count" lines sorted by descending count.
	  # The block form of File.open closes the file even if writing fails.
	  def write_histogram(kind, dict)
	    path = "#{@site[:name]}_#{@site[:page_limit]}_#{kind}_#{@time_started}.txt"
	    File.open(path, "w") do |f|
	      dict.sort {|a,b| b[1]<=>a[1]}.each { |t| f.puts "%s\t%s" % t }
	    end
	  end

	end

	# Example use:
	# :domain is interpolated into the URL-matching regexp built by
	# WordSpider#run, so it has to describe the host of :start_url. A bare
	# '\.com' never matches "http://you-web-page.com".
	ws = WordSpider.new({
	  :name => 'Whatever',
	  :start_url => 'http://you-web-page.com',
	  :domain => 'you-web-page\.com',
	  :page_limit => 100,
	  :simulate_googlebot => true,
	  :debug => true})
	ws.run
No results found