Created
March 17, 2012 14:15
-
-
Save isakb/2059670 to your computer and use it in GitHub Desktop.
A spider I wrote many years ago, kept here for archaeological purposes. It crawls web pages and builds word histograms to find the most common words.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #! /usr/bin/env ruby | |
| ['rubygems', 'spider', 'hpricot', 'iconv'].each {|f| require f } | |
# Converts strings between character encodings using Iconv, substituting "?"
# for every character that cannot be converted instead of aborting.
class Converter
  # encoding_to:: name of the target encoding, passed straight to Iconv.new.
  def initialize(encoding_to = "utf8")
    @to = encoding_to
  end

  # Converts +str+ from +encoding_from+ to the target encoding.
  #
  # When Iconv reports a conversion failure, the part converted so far is
  # kept, the first character of the unconverted remainder is replaced by "?"
  # (and reported on $stderr), and conversion resumes with what is left.
  #
  # Returns the converted string. Errors that are not Iconv conversion
  # failures (including anything raised by Iconv.new) propagate to the caller.
  def convert(str, encoding_from)
    iconv = Iconv.new(@to, encoding_from)
    converted = "".dup
    remaining = str
    begin
      converted << iconv.iconv(remaining)
    rescue Iconv::Failure => e
      # Only Iconv::Failure carries #success/#failed; rescuing anything
      # broader would blow up here with NoMethodError and hide the real error.
      converted << e.success
      bad_char, remaining = e.failed.split(//, 2)
      converted << "?"
      $stderr.puts "FAIL !! char:#{bad_char}"
      # Every retry drops at least one character from the remainder, and a
      # nil remainder is handed to Iconv#iconv as-is, so this terminates.
      retry
    end
    converted
  end
end
# Crawls a web site with the +spider+ gem and builds word-frequency
# histograms from the body text of the pages it visits.
#
# Words longer than the configured threshold are tallied in a separate
# "long word" histogram. Both histograms are written to tab-separated text
# files by #save.
class WordSpider
  # Raised inside the crawl to stop it once the page limit has been reached.
  class StopSpidering < RuntimeError; end

  # site::      options hash. Keys read by this class:
  #             :name (used in output file names), :start_url,
  #             :domain (regexp fragment a URL's host must match),
  #             :page_limit, :simulate_googlebot (send a Googlebot
  #             User-Agent) and :debug (re-raise crawl errors).
  # long_word:: words with more characters than this go to the long-word
  #             histogram.
  def initialize(site = {
                   :name => 'NAME',
                   :start_url => 'http://www.***.se/',
                   :domain => '***\.se',
                   :page_limit => 10,
                   :simulate_googlebot => true
                 },
                 long_word = 15)
    @site = site
    @utf = Converter.new("utf8")
    @word_dict = Hash.new(0)
    @longword_dict = Hash.new(0)
    @long_word_length = long_word
  end

  # Crawls from site[:start_url], counting words on every successfully
  # fetched page whose charset can be detected, then saves the histograms.
  #
  # page_limit:: number of pages to process when site[:page_limit] is not
  #              set; site[:page_limit] takes precedence when present.
  def run(page_limit = 10)
    @time_started = Time.now.strftime("%Y%m%d-%H%M%S")
    count_pages = @site[:page_limit] || page_limit
    puts "Starting"
    Thread.abort_on_exception = true
    begin
      Spider.start_at(@site[:start_url]) do |s|
        # see: http://spider.rubyforge.org/
        s.setup do |_a_url|
          if @site[:simulate_googlebot]
            s.headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
          end
        end

        # Stay on the configured domain (any subdomain is accepted).
        if @site[:domain]
          s.add_url_check do |a_url|
            a_url =~ %r{^http://(.+?\.)?#{@site[:domain]}(/.*)?$}
          end
        end

        # Skip feeds, stylesheets, scripts, media and other non-HTML files.
        s.add_url_check do |a_url|
          a_url !~ %r{\.(xml|rss|css|js|jpg|gif|png|swf|pdf|doc|exe|mp3)$}
        end

        s.on 404 do |a_url, _resp, prior_url|
          puts "404 not found: #{a_url}, prior: #{prior_url}"
        end

        s.on :failure do |a_url, _resp, prior_url|
          puts "URL failed: #{a_url}"
          puts " linked from #{prior_url}"
        end

        s.on :success do |a_url, resp, _prior_url|
          puts "#{a_url}: #{resp.code}"
          body = resp.body
          # The charset is sniffed from the page's content-type meta tag.
          encoding = body[/content="text\/html; charset=(.+?)"/, 1]
          if encoding.nil?
            puts "Unknown encoding at #{a_url}"
          else
            # Only pages that are actually processed count against the limit.
            count_pages -= 1
            process_page(body, encoding)
          end
          puts "#{count_pages} pages left to visit..."
          puts
          # Checkpoint the histograms at every multiple of ten pages left.
          save if count_pages.modulo(10) == 0
          sleep(0.1)
          raise StopSpidering if count_pages < 1
        end
      end
      puts "No more links to follow."
    rescue StopSpidering
      puts "Aborting spidering due to limit reached."
    rescue => e
      raise if @site[:debug]

      puts "Aborting due to exception:"
      p e
    end
    puts "Sorting word histogram according to frequency and saving to file."
    save
  end

  # Lower-cases +text+, strips non-letter characters and adds each remaining
  # word of two or more characters to the word or long-word histogram.
  # +text+ itself is not modified.
  #
  # Returns the number of words counted.
  def add_text(text)
    words = text.downcase
    # Collapse newlines, tabs etc. that disturb the text flow.
    words = words.gsub(/\s+/um, " ")
    # Blank out word-boundary-delimited runs containing a non-letter
    # character (e.g. parts of blabla@address.com).
    words = words.gsub(/\b[^\s]*?[^[:alpha:]åäöüïéáàô\s]+?[^\s]*?\b/um, " ")
    # Blank out anything that is neither a letter nor whitespace.
    words = words.gsub(/[^\s[:alpha:]åäöüïéáàô]+/um, " ")

    counted = 0
    # split(" ") already treats runs of whitespace as a single separator.
    words.split(" ").each do |word|
      next if word.length < 2

      if word.length > @long_word_length
        @longword_dict[word] += 1
      else
        @word_dict[word] += 1
      end
      counted += 1
    end
    counted
  end

  # Writes both histograms to text files in the current directory, one
  # "word<TAB>count" line per word, most frequent first (ties alphabetical).
  # File names are built from site[:name], site[:page_limit] and the start
  # time recorded by #run.
  def save
    print "Saving wordhistogram... "
    write_histogram("WordHistograms", @word_dict)
    print "Saving longwordhistogram..."
    write_histogram("LongWordHistograms", @longword_dict)
    print "Done saving files.\n"
  end

  private

  # Converts a fetched page to UTF-8, extracts its body text and feeds it to
  # #add_text, reporting progress on stdout.
  def process_page(html, encoding)
    puts "Processing content..."
    html = @utf.convert(html, encoding)
    # This fixes issues with <li>word</li><li>another word</li> joining:
    html = html.gsub(/></, "> <")
    doc = Hpricot(html).search("body")
    doc.search("script|embed|object|style|link|head|pre").remove
    puts "Adding words to word histogram"
    found = add_text(doc.inner_text)
    puts "Found #{found} words on page in all."
    puts "Word dict now contains #{@word_dict.length} unique words."
    puts "LongWord dict now contains #{@longword_dict.length} unique words."
  end

  # Writes one histogram to "<name>_<page_limit>_<kind>_<start time>.txt".
  def write_histogram(kind, dict)
    path = "#{@site[:name]}_#{@site[:page_limit]}_#{kind}_#{@time_started}.txt"
    File.open(path, "w") do |f|
      dict.sort_by { |word, count| [-count, word] }.each do |entry|
        f.puts "%s\t%s" % entry
      end
    end
  end
end
# Example use: describe the site to crawl, then start the spider.
# With :debug set, an error during the crawl is re-raised instead of
# only being printed.
example_site = {
  :name               => 'Whatever',
  :start_url          => 'http://you-web-page.com',
  :domain             => '\.com',
  :page_limit         => 100,
  :simulate_googlebot => true,
  :debug              => true
}
WordSpider.new(example_site).run
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment