jsooriah · May 3, 2011 17:03
diff --git a/elasticoverflow.rb b/elasticoverflow.rb
 # =======================================================
 # Importing and searching RSS with ElasticSearch and Tire
 # =======================================================
 #
 # This script downloads, parses and indexes Stackoverflow RSS feed with ElasticSearch
 # via the [Tire](https://github.com/karmi/tire) Rubygem.
 #
 # Requirements
 # ------------
 #
 # * Sun Java 6 (for ElasticSearch)
 # * Ruby >= 1.8.7
 # * Rubygems >= 1.5.0
 #
 # Usage
 # -----
 #
 #     ruby elasticoverflow.rb
 #

 require 'rubygems'
 require 'open-uri'
 require 'benchmark'

 # Load the third-party gems this script depends on. When one of them cannot
 # be loaded, print installation instructions to STDERR and exit with status 1.
 #
 ['tire', 'nokogiri'].each do |gem_name|
   begin
     require gem_name
   rescue LoadError
     STDERR.puts(
       "[ERROR] Required library '#{gem_name}' missing.",
       "        Please install it with:",
       "        $ gem install #{gem_name}",
       "\n"
     )
     exit(1)
   end
 end

 # Probe the local ElasticSearch HTTP endpoint. Any StandardError raised by
 # the request counts as "not running": setup instructions are printed and
 # the script exits with status 1.
 #
 elasticsearch_up = begin
   RestClient.get('http://localhost:9200')
 rescue StandardError
   false
 end

 unless elasticsearch_up
   puts <<-"INSTALL"

 [ERROR] You don’t appear to have ElasticSearch installed. Please install and launch it with the following commands:
 
         curl -k -L -o elasticsearch-0.16.0.tar.gz http://github.com/downloads/elasticsearch/elasticsearch/elasticsearch-0.16.0.tar.gz
         tar -zxvf elasticsearch-0.16.0.tar.gz
         ./elasticsearch-0.16.0/bin/elasticsearch -f
 INSTALL
   exit(1)
 end

 URL = 'http://stackoverflow.com/feeds'

 puts "", "Fetching data from '#{URL}'...", "-"*80

 # Download and parse the Stackoverflow feed
 #
 # Ruby 3.0 removed open-uri's URL support from Kernel#open, so a bare
 # `open(URL)` treats the URL as a file path there and raises Errno::ENOENT.
 # URI.open (available since Ruby 2.5) is used when present; older
 # interpreters, which this script still lists as supported, fall back to
 # Kernel#open.
 #
 # NOTE(review): the feed is parsed with Nokogiri::HTML, not an XML parser.
 # The XPath expressions used on `feed` carry no namespace prefixes, so they
 # would need revisiting if the parser were ever changed.
 #
 feed_io = URI.respond_to?(:open) ? URI.open(URL) : open(URL)
 feed    = Nokogiri::HTML(feed_io)

 # Turn every <entry> element of the feed into a plain Hash ready for import
 #
 documents = feed.search("//entry").map do |entry|
   {
     :type       => 'question',
     # Digits captured from the "questions/<digits>/" part of the <id> text
     :id         => entry.xpath("id").text[/questions\/(\d+)\//, 1],
     :title      => entry.xpath("title").text,
     :link       => entry.xpath("link[@rel='alternate']/@href").text,
     # One string per <category term="..."> attribute
     :categories => entry.xpath("category/@term").map(&:to_s),
     :author     => entry.xpath("author/name").text,
     :published  => entry.xpath("published").text,
     :summary    => entry.xpath("summary").text
   }
 end

 puts "", "Importing these #{documents.size} documents:", "-"*80

 # List the title of every document about to be imported
 #
 documents.each do |doc|
   puts "* #{doc[:title]}"
 end

 # Create the index, import the documents and refresh it, timing the whole
 # operation with Benchmark.realtime (seconds, converted to ms when printed)
 #
 elapsed = Benchmark.realtime do

   # All mapped fields are strings; the two groups differ only in analyzer
   #
   keyword_fields  = [:id, :link, :categories, :author]
   snowball_fields = [:title, :summary]

   field_properties = {}
   keyword_fields.each do |field|
     field_properties[field] = { :type => 'string', :analyzer => 'keyword' }
   end
   snowball_fields.each do |field|
     field_properties[field] = { :type => 'string', :analyzer => 'snowball' }
   end

   Tire.index 'stackoverflow' do

     # Create the index with the mapping for the `question` type
     # (the original author notes this applies only if it does not exist yet)
     #
     create :mappings => { :question => { :properties => field_properties } }

     # Import documents
     import documents

     # Refresh the index for immediate searching
     #
     refresh
   end

 end

 puts "-"*80, "Importing took #{(elapsed*1000).to_i} milliseconds"

 puts "", "Searching...", "-"*80

 # Define the search against the 'stackoverflow' index
 #
 ruby_search = Tire.search('stackoverflow') do

   # Query string: ‘ruby’
   #
   query { string 'ruby' }

   # Terms facet named 'categories' over the `categories` field,
   # declared with :global => true
   #
   facet('categories') { terms :categories, :global => true }
 end

 puts "Search took #{ruby_search.results.time} milliseconds."

 puts "", "Any questions about ruby?", "-"*80

 # One line per hit: author, title and the comma-separated categories
 #
 ruby_search.results.each do |question|
   tags = question.categories.join(', ')
   puts "#{ question.author } : #{question.title} [#{tags}]"
 end

 puts "", "Top 10 categories in database:", "-"*80

 # One line per facet term: the term padded to 15 columns, then its count
 #
 ruby_search.results.facets['categories']['terms'].each do |facet_term|
   puts "#{facet_term['term'].ljust(15)} #{facet_term['count']}"
 end

 puts "", "Or, try the search with curl:", "-"*80
 puts ruby_search.to_curl
	# =======================================================
	# Importing and searching RSS with ElasticSearch and Tire
	# =======================================================
	#
	# This script downloads, parses and indexes Stackoverflow RSS feed with ElasticSearch
	# via the [Tire](https://github.com/karmi/tire) Rubygem.
	#
	# Requirements
	# ------------
	#
	# * Sun Java 6 (for ElasticSearch)
	# * Ruby >= 1.8.7
	# * Rubygems >= 1.5.0
	#
	# Usage
	# -----
	#
	#     ruby elasticoverflow.rb
	#

	require 'rubygems'
	require 'open-uri'
	require 'benchmark'

	# Check for required Rubygems, exit otherwise
	#
	# Each gem is required in turn; on LoadError the install hint is written to
	# STDERR and the script exits with status 1.
	#
	%w[ tire nokogiri ].each do |lib|
	  begin
	    require lib
	  rescue LoadError
	    STDERR.puts "[ERROR] Required library '#{lib}' missing.", " Please install it with:", " $ gem install #{lib}", "\n"
	    exit(1)
	  end
	end

	# Probe the local ElasticSearch HTTP endpoint. Any StandardError raised by
	# the request counts as "not running": setup instructions are printed and
	# the script exits with status 1.
	#
	elasticsearch_up = begin
	  RestClient.get('http://localhost:9200')
	rescue StandardError
	  false
	end

	unless elasticsearch_up
	  puts <<-"INSTALL"

	[ERROR] You don’t appear to have ElasticSearch installed. Please install and launch it with the following commands:

	curl -k -L -o elasticsearch-0.16.0.tar.gz http://github.com/downloads/elasticsearch/elasticsearch/elasticsearch-0.16.0.tar.gz
	tar -zxvf elasticsearch-0.16.0.tar.gz
	./elasticsearch-0.16.0/bin/elasticsearch -f
	INSTALL
	  exit(1)
	end

	URL = 'http://stackoverflow.com/feeds'

	puts "", "Fetching data from '#{URL}'...", "-"*80

	# Parse the Stackoverflow RSS
	#
	# Ruby 3.0 removed open-uri's URL support from Kernel#open, so a bare
	# `open(URL)` treats the URL as a file path there and raises Errno::ENOENT.
	# URI.open (available since Ruby 2.5) is used when present; older
	# interpreters fall back to Kernel#open.
	#
	# NOTE(review): the feed is parsed with Nokogiri::HTML, not an XML parser.
	# The XPath expressions used on `feed` carry no namespace prefixes, so they
	# would need revisiting if the parser were ever changed.
	#
	feed_io = URI.respond_to?(:open) ? URI.open(URL) : open(URL)
	feed    = Nokogiri::HTML(feed_io)

	# Prepare the documents
	#
	# Every <entry> element of the feed becomes one Hash with the fields
	# that are imported into the index.
	#
	documents = feed.search("//entry").map do |entry|
	  result              = {}
	  result[:type]       = 'question'
	  # Digits captured from the "questions/<digits>/" part of the <id> text
	  result[:id]         = entry.xpath("id").text[/questions\/(\d+)\//, 1]
	  result[:title]      = entry.xpath("title").text
	  result[:link]       = entry.xpath("link[@rel='alternate']/@href").text
	  # One string per <category term="..."> attribute
	  result[:categories] = entry.xpath("category/@term").map { |c| c.to_s }
	  result[:author]     = entry.xpath("author/name").text
	  result[:published]  = entry.xpath("published").text
	  result[:summary]    = entry.xpath("summary").text

	  result
	end

	puts "", "Importing these #{documents.size} documents:", "-"*80

	documents.each { |document| puts "* #{document[:title]}" }

	# Create the index, import the documents and refresh it, timing the whole
	# operation with Benchmark.realtime (seconds, converted to ms when printed)
	#
	elapsed = Benchmark.realtime do

	  Tire.index 'stackoverflow' do

	    # Create the index with proper mapping (if not exists already)
	    #
	    create :mappings => {
	      :question => {
	        :properties => {
	          :id         => { :type => 'string', :analyzer => 'keyword' },
	          :link       => { :type => 'string', :analyzer => 'keyword' },
	          :categories => { :type => 'string', :analyzer => 'keyword' },
	          :author     => { :type => 'string', :analyzer => 'keyword' },
	          :title      => { :type => 'string', :analyzer => 'snowball' },
	          :summary    => { :type => 'string', :analyzer => 'snowball' }
	        }
	      }
	    }

	    # Import documents
	    import documents

	    # Refresh the index for immediate searching
	    #
	    refresh
	  end

	end

	# `*` repeats the dash 80 times and scales seconds to milliseconds
	puts "-"*80, "Importing took #{(elapsed*1000).to_i} milliseconds"

	puts "", "Searching...", "-"*80

	# Define the search against the 'stackoverflow' index
	#
	s = Tire.search('stackoverflow') do

	  # Search for questions containing ‘ruby’
	  #
	  query { string 'ruby' }

	  # Retrieve aggregated counts for top ten categories
	  #
	  facet('categories') { terms :categories, :global => true }
	end

	puts "Search took #{s.results.time} milliseconds."

	puts "", "Any questions about ruby?", "-"*80

	# One line per hit: author, title and the comma-separated categories
	#
	s.results.each do |d|
	  puts "#{ d.author } : #{d.title} [#{d.categories.join(', ')}]"
	end

	puts "", "Top 10 categories in database:", "-"*80

	# One line per facet term: the term padded to 15 columns, then its count
	#
	s.results.facets['categories']['terms'].each do |f|
	  puts "#{f['term'].ljust(15)} #{f['count']}"
	end

	puts "", "Or, try the search with curl:", "-"*80
	puts s.to_curl
No results found