# =======================================================
# Importing and searching RSS with ElasticSearch and Tire
# =======================================================
#
# This script downloads, parses, and indexes the Stackoverflow RSS feed with ElasticSearch
# via the [Tire](https://github.com/karmi/tire) Rubygem.
#
# Requirements
# ------------
#
# * Sun Java 6 (for ElasticSearch)
# * Ruby >= 1.8.7
# * Rubygems >= 1.5.0
#
# Usage
# -----
#
#     ruby elasticoverflow.rb
#
require 'rubygems'
require 'open-uri'
require 'benchmark'

# Check for required Rubygems, exit otherwise
#
%w[ tire nokogiri ].each do |lib|
  begin
    require lib
  rescue LoadError
    STDERR.puts "[ERROR] Required library '#{lib}' missing.",
                " Please install it with:",
                " $ gem install #{lib}", "\n"
    exit(1)
  end
end

# Check if ElasticSearch is running on this machine, exit otherwise
#
( puts <<-"INSTALL" ; exit(1) ) unless (RestClient.get('http://localhost:9200') rescue false)

[ERROR] You don’t appear to have ElasticSearch installed. Please install and launch it with the following commands:

    curl -k -L -o elasticsearch-0.16.0.tar.gz http://github.com/downloads/elasticsearch/elasticsearch/elasticsearch-0.16.0.tar.gz
    tar -zxvf elasticsearch-0.16.0.tar.gz
    ./elasticsearch-0.16.0/bin/elasticsearch -f
INSTALL
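
# Note: to verify by hand that ElasticSearch is reachable (assuming it runs on
# the default HTTP port 9200), a plain request such as
#
#     curl http://localhost:9200
#
# should return a small JSON document with the node name, version and status.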

URL = 'http://stackoverflow.com/feeds'

puts "", "Fetching data from '#{URL}'...", "-"*80

# Parse the Stackoverflow RSS
#
feed = Nokogiri::HTML(open(URL))

# Prepare the documents
#
documents = feed.search("//entry").map do |entry|
  result = {}
  result[:type]       = 'question'
  result[:id]         = entry.xpath("id").text[/questions\/(\d+)\//, 1]
  result[:title]      = entry.xpath("title").text
  result[:link]       = entry.xpath("link[@rel='alternate']/@href").text
  result[:categories] = entry.xpath("category/@term").map { |c| c.to_s }
  result[:author]     = entry.xpath("author/name").text
  result[:published]  = entry.xpath("published").text
  result[:summary]    = entry.xpath("summary").text
  result
end
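
# Each feed entry is thus reduced to a plain Hash. An illustrative example of
# the shape being indexed (values are made up, not taken from the live feed):
#
#   { :type       => 'question',
#     :id         => '123456',
#     :title      => 'How do I parse an Atom feed in Ruby?',
#     :link       => 'http://stackoverflow.com/questions/123456/...',
#     :categories => ['ruby', 'rss'],
#     :author     => 'Jane Doe',
#     :published  => '2011-05-03T12:00:00Z',
#     :summary    => '...' }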
puts "", "Importing these #{documents.size} documents:", "-"*80 | |
documents.each { |document| puts "* #{document[:title]}" } | |
elapsed = Benchmark.realtime do | |
Tire.index 'stackoverflow' do | |
# Create the index with proper mapping (if not exists already) | |
# | |
create :mappings => { | |
:question => { | |
:properties => { | |
:id => { :type => 'string', :analyzer => 'keyword' }, | |
:link => { :type => 'string', :analyzer => 'keyword' }, | |
:categories => { :type => 'string', :analyzer => 'keyword' }, | |
:author => { :type => 'string', :analyzer => 'keyword' }, | |
:title => { :type => 'string', :analyzer => 'snowball' }, | |
:summary => { :type => 'string', :analyzer => 'snowball' } | |
} | |
} | |
} | |
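
    # (The 'keyword' analyzer stores the whole field value as a single token,
    #  so ids, links, categories and authors can be matched and faceted on
    #  exactly, while 'snowball' stems the English text in titles and summaries.
    #  Once the index exists, the stored mapping can be inspected with, assuming
    #  the default localhost:9200:
    #
    #      curl http://localhost:9200/stackoverflow/_mapping
    # )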

    # Import documents
    import documents

    # Refresh the index for immediate searching
    #
    refresh
  end
end

puts "-"*80, "Importing took #{(elapsed*1000).to_i} milliseconds"
puts "", "Searching...", "-"*80 | |
s = Tire.search('stackoverflow') do | |
# Search for questions containing ‘ruby’ | |
# | |
query { string 'ruby' } | |
# Retrieve aggregated counts for top ten categories | |
# | |
facet('categories') { terms :categories, :global => true } | |
end | |
puts "Search took #{s.results.time} milliseconds." | |
puts "", "Any questions about ruby?", "-"*80 | |
s.results.each do |d| | |
puts "#{ d.author } : #{d.title} [#{d.categories.join(', ')}]" | |
end | |
puts "", "Top 10 categories in database:", "-"*80 | |
s.results.facets['categories']['terms'].each do |f| | |
puts "#{f['term'].ljust(15)} #{f['count']}" | |
end | |
puts "", "Or, try the search with curl:", "-"*80 | |
puts s.to_curl |
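
# For reference, the request printed by `to_curl` above is roughly of this form
# (the exact formatting and defaults depend on the Tire and ElasticSearch versions):
#
#     curl -X GET "http://localhost:9200/stackoverflow/_search?pretty=true" -d '{
#       "query"  : { "query_string" : { "query" : "ruby" } },
#       "facets" : { "categories" : { "terms" : { "field" : "categories" }, "global" : true } }
#     }'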