# --------------------------------------------------------------
# Simplified model of Facebook's Message Inbox Search with HBase
# --------------------------------------------------------------
#
# Facebook exploits versioning support in HBase with a very interesting twist:
# it stores message IDs for a given token as “custom timestamps” in the database.
#
# The [HBase: The Definitive Guide](http://ofps.oreilly.com/titles/9781449396107/advanced.html#advsearch) book says (p. 385):
#
# > A prominent implementation of a client managed solution is the Facebook inbox search. The schema is built roughly like this:
# >
# > * Every row is a single inbox, i.e., every user has a single row in the search table,
# >
# > * the columns are the terms indexed from the messages,
# >
# > * the versions are the message IDs,
# >
# > * the values contain additional information, such as the position of the term in the document.
#
# See also the [Facebook Messages & HBase](http://www.slideshare.net/brizzzdotcom/facebook-messages-hbase/14) presentation.
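#
# To make the schema concrete: after the two messages indexed below, Mary's
# row would hold cells roughly like this (an illustrative sketch, not actual
# shell output; in this simplified model the values are empty strings, where
# the real system would store term positions):
#
#     row   | column        | version (message ID) | value
#     ------|---------------|----------------------|-------
#     mary  | index:dinner  | 2                    | ''
#     mary  | index:dinner  | 1                    | ''
#     mary  | index:coffee  | 2                    | ''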
#
# Run the example with:
#
#     $ hbase shell facebook-messages-search.rb
#
# --------------------------------------------------------------

# First, some auxiliary infrastructure:

# 1) Let's define some stopwords for the tokenization process.
#
STOPWORDS = %w|a an and are as at but by for if in is it no not of on or that the then there these they this to was will with|

# 2) Let's define a method to create tokens from the text stream.
#
def tokenize content
  content.split(/\W/).
    map    { |word| word.downcase }.
    reject { |word| STOPWORDS.include?(word) || word == '' }
end
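
# For example (a sketch of the expected result, given the stopwords above):
#
#     tokenize("Let's have a dinner!") # => ["let", "s", "have", "dinner"]
#
# Note that the naive /\W/ split breaks "Let's" into "let" and "s"; a real
# analyzer would handle apostrophes, stemming, and so on.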

# 3) Let's define a method to search a user's messages for given words.
#
def search words
  columns = tokenize(words).map { |t| "index:#{t}" }
  puts "Let's search for words #{tokenize(words).map { |t| "'#{t}'" }.join(', ')}:"
  puts "> get 'messages', 'mary', { COLUMNS => #{columns.inspect}, VERSIONS => 10 }", ""
  get 'messages', 'mary', { COLUMNS => columns, VERSIONS => 10 }
end
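
# The VERSIONS => 10 option limits the result to the ten most recent
# versions per column, i.e. the last ten message IDs stored for each term.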

# Now, let's add some data.

# Create the table to hold the index for messages. Every user has one row
# in the table. (Drop the table first, in case it exists from a previous run.)
#
disable 'messages'
drop    'messages'
create  'messages', {NAME => 'index', VERSIONS => 1000}
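
# VERSIONS => 1000 tells HBase to keep up to a thousand versions per cell
# (instead of the default three), i.e. up to a thousand message IDs per term.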

# Mary receives a message...
#
message = {:id => 1, :content => "Let's have a dinner!"}

# Let's index message 1:
#
tokens = tokenize(message[:content])
puts "Analyzed content '#{message[:content]}' as: #{tokens.join(', ')}"
tokens.each do |token|
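  # The shell's put signature is: put 'table', 'row', 'column', 'value', timestamp.
  # Here we store an empty value and pass the message ID in place of the timestamp.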
  put 'messages', 'mary', "index:#{token}", '', message[:id]
end

# Mary receives another message...
#
message = {:id => 2, :content => "Hmm, dinner? What about just a coffee?"}

# Let's index message 2:
#
tokens = tokenize(message[:content])
puts "Analyzed content '#{message[:content]}' as: #{tokens.join(', ')}"
tokens.each do |token|
  put 'messages', 'mary', "index:#{token}", '', message[:id]
end

# OK, what does the index for Mary's messages look like now?
puts "Index for Mary's messages contains these tokens (columns):"
puts "> get 'messages', 'mary', 'index'", ""
get 'messages', 'mary', 'index'
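
# Each indexed token appears as a separate column in Mary's row; without a
# VERSIONS option, `get` returns only the newest version of each cell, i.e.
# the highest message ID stored for that term.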

# Let's search Mary's last 10 messages for some terms, such as 'dinner' or 'coffee'.
#
query = 'dinner coffee'
search(query)
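
# Since 'dinner' was indexed for both messages and 'coffee' only for the
# second one, the result should list versions (message IDs) 2 and 1 under
# 'index:dinner', and version 2 under 'index:coffee'.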