wbzyl · January 11, 2012 21:09
diff --git a/percolated-twitter.rb b/percolated-twitter.rb
 # Reversed or “Real Time” Search in ElasticSearch
 # ====================================================================================

 # You may have come across the term “realtime search” lately
 # (eg. [here](http://engineering.socialcast.com/2011/05/realtime-search-solr-vs-elasticsearch/))
 # and wondered what all the fuss is about.
 #
 # Well, the usual workflow with search engines goes like this:
 #
 # 1. You index some documents.
 # 2. You perform a search query.
 #
 # In [_ElasticSearch_](http://www.elasticsearch.org/), there's a default delay
 # of 1 second between indexing a document and being able to search it.
 # You can, of course, force refresh the index. That's enough “realtime”
 # in my book, if you ask me.
 #
 # However, _ElasticSearch_ comes with a much more powerful idea for “realtime search”,
 # called [_percolation_](http://www.elasticsearch.org/guide/reference/api/percolate.html).
 #
 # It _reverses_ the usual workflow like this:
 #
 # 1. You register some queries you'd like to perform in real time, on demand,
 # or _while the documents are being indexed_.
 # 2. You index some documents.
 # 
 # The search engine will match your documents against your queries, and return names of matching queries
 # in the response. Yes, _names_.
 #
 # This allows you to do crazy stuff immediately, _while your documents are being indexed_.
 # Think [Google Alerts](http://www.google.com/alerts) for _your_ data.
 #
 # The _Tire_ gem for _ElasticSearch_ recently got
 # [percolation support](http://karmi.github.com/tire/#section-Percolation).
 #
 # In this script, we'll register a couple of queries, and then fetch data from _Twitter_,
 # receiving notifications when any status message matches one of our queries.
 #
 # You can download this script at <https://gist.github.com/1025498>.
 #

 # First of all, let's install some gems:
 #
 #     gem install tire ansi yajl-ruby
 #
 # Note that you need the 0.1.11 version of the `tire` gem.
 #
 require 'rubygems'
 require 'time'
 require 'uri'
 require 'tire'
 require 'ansi/code'
 require 'yajl/http_stream'

 include ANSI::Code

 # Let's define a class to hold our data in _ElasticSearch_.
 #
 # We're using the _persistence_ mode of _Tire_, so we don't need a database, because 
 # [_the index is our database_](http://www.slideshare.net/karmi/your-data-your-search-elasticsearch-euruko-2011/49).
 #
 class Status
   # Persistence mode of _Tire_: the index itself stores the documents.
   include Tire::Model::Persistence

   # Attributes kept for every status message.
   property :user
   property :text
   property :created_at

   # Percolation callback.
   #
   # Whenever a new document is saved in the index, this block is executed,
   # and the names of the matching queries are available via `Status#matches`.
   #
   # Here we only announce the match on the console; documents which match
   # no registered query stay silent.
   #
   on_percolate do
     unless matches.empty?
       puts(green { "'#{text}' from @#{user} matches queries: #{matches.inspect}" })
     end
   end
 end

 # Let's register the queries for percolation now.
 #
 # First, let's define the query_string queries.
 #
 # The query_string queries, keyed by topic.
 q = {
   :newspeak => 'wow omg lol wtf fuu*',
   :fail     => 'fail',
   :memes    => '"why u no" "all your base" "i can has"'
 }
 #
 # Save those queries in _ElasticSearch_. Each pair is the name the query is
 # registered under, followed by the key of its query string in `q`
 # (note that the `:fail` query is registered under the name 'fails').
 #
 [['newspeak', :newspeak], ['fails', :fail], ['memes', :memes]].each do |name, key|
   Status.index.register_percolator_query(name) { |query| query.string q[key] }
 end

 puts ""
 puts bold { "Testing percolation" }
 puts '-' * 80

 # Try the percolation on some “example data” first: build a document with
 # `Status.new` and print whatever `percolate` reports for it.
 #
 example = Status.new(:text => 'OMG i can has #fail')
 puts "'#{example.text}' matches queries #{example.percolate.inspect}"

 # Now, let's fetch some real data from the collective consciousness.
 #
 # NOTE(review): this is a Twitter API v1 endpoint, which has reportedly been
 # retired -- confirm it still responds before relying on this loop.
 url = URI.parse("http://api.twitter.com/1/statuses/public_timeline.json")
 puts ""
 puts bold { "Fetching '#{url}'" }
 puts '-' * 80

 # Request the timeline five times, pausing ten seconds between requests
 # (there is no pause after the last one).
 5.times do |round|

   # Get JSON data from _Twitter_; hash keys are symbolized.
   Yajl::HttpStream.get(url, :symbolize_keys => true) do |tweets|

     # Create a new document from each status message.
     #
     # Watch the output from the `on_percolate` callback in your console.
     # You may have to run this file repeatedly, in case _Twitter_ gets quiet.
     #
     tweets.each do |tweet|
       Status.create(
         :id         => tweet[:id],
         :user       => tweet[:user][:screen_name],
         :text       => tweet[:text],
         :created_at => Time.parse(tweet[:created_at])
       )
     end

     puts "Indexed #{tweets.size} tweets"
   end

   sleep 10 unless round == 4
 end

 puts ""
 puts bold { "Check out your index" }
 puts '-' * 80

 # You can check out the documents in your index with `curl` or your browser;
 # the printed command lists the five most recent statuses.
 #
 puts "curl 'http://localhost:9200/statuses/_search?q=*&sort=created_at:desc&size=5&pretty=true'"
 puts ""
	# Reversed or “Real Time” Search in ElasticSearch
	# ====================================================================================

	# You may have come across the term “realtime search” lately
	# (eg. [here](http://engineering.socialcast.com/2011/05/realtime-search-solr-vs-elasticsearch/))
	# and wondered what all the fuss is about.
	#
	# Well, the usual workflow with search engines goes like this:
	#
	# 1. You index some documents.
	# 2. You perform a search query.
	#
	# In [_ElasticSearch_](http://www.elasticsearch.org/), there's a default delay
	# of 1 second between indexing a document and being able to search it.
	# You can, of course, force refresh the index. That's enough “realtime”
	# in my book, if you ask me.
	#
	# However, _ElasticSearch_ comes with a much more powerful idea for “realtime search”,
	# called [_percolation_](http://www.elasticsearch.org/guide/reference/api/percolate.html).
	#
	# It _reverses_ the usual workflow like this:
	#
	# 1. You register some queries you'd like to perform in real time, on demand,
	# or _while the documents are being indexed_.
	# 2. You index some documents.
	#
	# The search engine will match your documents against your queries, and return names of matching queries
	# in the response. Yes, _names_.
	#
	# This allows you to do crazy stuff immediately, _while your documents are being indexed_.
	# Think [Google Alerts](http://www.google.com/alerts) for _your_ data.
	#
	# The _Tire_ gem for _ElasticSearch_ recently got
	# [percolation support](http://karmi.github.com/tire/#section-Percolation).
	#
	# In this script, we'll register a couple of queries, and then fetch data from _Twitter_,
	# receiving notifications when any status message matches one of our queries.
	#
	# You can download this script at <https://gist.github.com/1025498>.
	#

	# First of all, let's install some gems:
	#
	# gem install tire ansi yajl-ruby
	#
	# Note that you need the 0.1.11 version of the `tire` gem.
	#
	require 'rubygems'
	require 'time'
	require 'uri'
	require 'tire'
	require 'ansi/code'
	require 'yajl/http_stream'

	include ANSI::Code

	# Let's define a class to hold our data in _ElasticSearch_.
	#
	# We're using the _persistence_ mode of _Tire_, so we don't need a database, because
	# [_the index is our database_](http://www.slideshare.net/karmi/your-data-your-search-elasticsearch-euruko-2011/49).
	#
	class Status
	  # Persistence mode of _Tire_: the index itself stores the documents.
	  include Tire::Model::Persistence

	  # Attributes kept for every status message.
	  property :user
	  property :text
	  property :created_at

	  # Percolation callback.
	  #
	  # Whenever a new document is saved in the index, this block is executed,
	  # and the names of the matching queries are available via `Status#matches`.
	  #
	  # Here we only announce the match on the console; documents which match
	  # no registered query stay silent.
	  #
	  on_percolate do
	    unless matches.empty?
	      puts(green { "'#{text}' from @#{user} matches queries: #{matches.inspect}" })
	    end
	  end
	end

	# Let's register the queries for percolation now.
	#
	# First, let's define the query_string queries.
	#
	# The query_string queries, keyed by topic.
	q = {}
	q[:newspeak] = 'wow omg lol wtf fuu*'
	q[:fail]     = 'fail'
	q[:memes]    = '"why u no" "all your base" "i can has"'
	#
	# Second, let's save those queries in _ElasticSearch_, each under its own name
	# (note that the `:fail` query is registered under the name 'fails').
	#
	# The block parameter must be delimited by plain pipes; the backslash-escaped
	# form (`\|query\|`) is not valid Ruby.
	#
	Status.index.register_percolator_query('newspeak') { |query| query.string q[:newspeak] }
	Status.index.register_percolator_query('fails')    { |query| query.string q[:fail] }
	Status.index.register_percolator_query('memes')    { |query| query.string q[:memes] }

	[ "", bold { "Testing percolation" }, '-' * 80 ].each { |line| puts line }

	# Try the percolation on some “example data” first: build a document with
	# `Status.new` and print whatever `percolate` reports for it.
	#
	example = Status.new(:text => 'OMG i can has #fail')
	puts "'#{example.text}' matches queries #{example.percolate.inspect}"

	# Now, let's fetch some real data from the collective consciousness.
	#
	# NOTE(review): this is a Twitter API v1 endpoint, which has reportedly been
	# retired -- confirm it still responds before relying on this loop.
	url = URI.parse("http://api.twitter.com/1/statuses/public_timeline.json")
	puts "", bold { "Fetching '#{url}'" }, '-'*80

	# Request the timeline five times, pausing ten seconds between requests
	# (there is no pause after the last one).
	#
	# Block parameters are delimited by plain pipes; the backslash-escaped
	# form (`\|i\|`) is not valid Ruby.
	5.times do |i|

	  # Get JSON data from _Twitter_; hash keys are symbolized.
	  Yajl::HttpStream.get(url, :symbolize_keys => true) do |timeline|
	    timeline.each do |status|

	      # Create new document from each status message.
	      #
	      # Watch the output from the `on_percolate` callback in your console.
	      # You may have to run this file repeatedly, in case _Twitter_ gets quiet.
	      #
	      Status.create :id => status[:id],
	                    :user => status[:user][:screen_name],
	                    :text => status[:text],
	                    :created_at => Time.parse(status[:created_at])
	    end

	    puts "Indexed #{timeline.size} tweets"
	  end
	  sleep 10 if i < 4
	end

	[ "", bold { "Check out your index" }, '-' * 80 ].each { |line| puts line }

	# You can check out the documents in your index with `curl` or your browser;
	# the printed command lists the five most recent statuses.
	#
	puts "curl 'http://localhost:9200/statuses/_search?q=*&sort=created_at:desc&size=5&pretty=true'"
	puts ""
No results found