Skip to content

Instantly share code, notes, and snippets.

@kornypoet
Created November 15, 2011 23:22
Show Gist options
  • Save kornypoet/1368717 to your computer and use it in GitHub Desktop.
require 'rspec'
require './search_scraper'
require 'em-synchrony'
# Scratch directory for spec artifacts, located beside this spec file.
def tmp_dir
  File.join File.dirname(__FILE__), 'tmp'
end
# Namespaced subdirectory the scraper writes ("rips") results into.
def ripd_dir
  File.join tmp_dir, 'social.network.twitter'
end
# Canonical options hash handed to SearchScraper.new throughout these specs.
def cfg_hash
  { :namespace => "social.network.twitter",
    :protocol  => "twitter_search",
    :url       => "http://search.twitter.com/search.json",
    :ripd_dir  => tmp_dir,
    :keywords  => %w[ fail occupy oldspice ] }
end
# Spec suite for SearchScraper (RSpec 2 `should`/`its` syntax).
describe SearchScraper do
# Ensure the output directory exists before the scraper opens files in it.
before :all do
FileUtils.mkdir_p ripd_dir
end
subject { SearchScraper.new cfg_hash }
# data_file is built from :ripd_dir/:namespace/:protocol by Wukong::FilenamePattern.
its(:data_file) { should match("#{ripd_dir}/twitter_search") }
its(:connection) { should be_instance_of(EventMachine::HttpConnection) }
its(:config) { should == cfg_hash }
its(:keywords) { should == [ "fail", "occupy", "oldspice" ] }
# ripd_file memoizes via ||= -- same File object on every call.
it "should memoize its ripd_file" do
subject.ripd_file.should be(subject.ripd_file)
end
it "should include a Log module" do
SearchScraper.const_defined?(:Log).should be_true
end
# Doubles standing in for an em-http response (status 200, JSON body).
let(:header) { double :header, :status => 200 }
let(:res) { double :res, :response => '{"foo":"bar"}', :response_header => header }
# `.ordered` asserts puts happens before flush.
it "should write to the ripd_file and flush the file afterwards" do
subject.ripd_file.should_receive(:puts).with( "200\t{\"foo\":\"bar\"}").ordered
subject.ripd_file.should_receive(:flush).ordered
subject.write(res)
end
# File.read accepts the File object here because File responds to #to_path.
it "should properly format writes to the ripd_file" do
subject.write(res)
File.read(subject.ripd_file).should == "200\t{\"foo\":\"bar\"}\n"
end
# :page is dropped from the query when nil, kept when supplied.
it "should properly construct the params hash" do
subject.params("foo").should == { :query => { "rpp" => 100, "q" => "foo" } }
subject.params("bar", 2).should == { :query => { "rpp" => 100, "q" => "bar", "page" => 2 } }
end
# complete must log a summary, close the file, and stop the reactor; run
# inside EM.synchrony so EventMachine.stop has a reactor to stop.
it "should shutdown properly" do
Log.should_receive(:info).with("Search scrape complete. Completions: 100. Failures: 1")
subject.ripd_file.should_receive(:close)
# EventMachine.should_receive(:stop)
EventMachine.synchrony do
subject.complete(100, 1)
end
end
# Remove the scratch directory created in before(:all).
after :all do
FileUtils.remove_dir ripd_dir
end
end
################################
require 'rubygems'
require 'eventmachine'
require 'em-http'
require 'em-http/middleware/oauth'
require 'wukong/filename_pattern'
require 'gorillib'
require 'gorillib/logger/log'
require 'gorillib/hash/deep_compact'
require 'configliere' ; Settings.use(:commandline).resolve!
# Load a settings file whose path is the first non-flag CLI argument.
Settings.read Settings.rest.first
# Scrapes the (legacy) Twitter Search API over EventMachine, appending one
# "<status>\t<json body>" line per response to a timestamped output file.
class SearchScraper
  attr_reader :connection, :data_file, :ripd_file, :config, :keywords

  # @param options [Hash] expects :ripd_dir, :namespace, :protocol, :url and
  #   :keywords; an optional :oauth hash enables the OAuth middleware.
  def initialize(options={})
    # Output path, e.g. <ripd_dir>/<namespace>/<protocol>-<ts>-<host>-<pid>.json
    @data_file = Wukong::FilenamePattern.new(
      ":ripd_dir/:namespace/:protocol-:timestamp-:hostname-:pid.json",
      :ripd_dir  => options[:ripd_dir],
      :namespace => options[:namespace],
      :protocol  => options[:protocol]
    ).make
    @keywords   = options[:keywords]
    @connection = EventMachine::HttpRequest.new options[:url]
    @connection.use EventMachine::Middleware::OAuth, options[:oauth] if options[:oauth]
    @config     = options
  end

  # Lazily open (and memoize) the output file in append mode.
  def ripd_file
    @ripd_file ||= File.open(data_file, 'a')
  end

  # Log a completion summary, close the output file, and stop the reactor.
  def complete comps, fails
    Log.info("Search scrape complete. Completions: #{comps}. Failures: #{fails}")
    ripd_file.close
    EventMachine.stop
  end

  # Query params for one search request; the :page entry is omitted when nil.
  # Plain Hash#reject replaces gorillib's deep_compact! -- "page" is the only
  # nilable value in this hash, so a shallow nil-strip is equivalent.
  def params(keyword, page=nil)
    { :query => { "rpp" => 100, "q" => keyword, "page" => page }.reject { |_, v| v.nil? } }
  end

  # Append "<status>\t<body>" and flush immediately so a crash mid-scrape
  # loses at most the in-flight response.
  def write res
    code = res.response_header.status
    body = res.response
    ripd_file.puts [code, body].join("\t")
    ripd_file.flush
  end

  def run!
    Log.info("Connection opened. Job details:\n#{config.inspect}")
    # BUG FIX: `params` requires a keyword argument; the original bare call
    # raised ArgumentError whenever run! executed. Request the first
    # configured keyword. NOTE(review): looping over all @keywords is
    # probably the intent -- confirm before extending.
    http = connection.get(params(keywords.first))
    # NOTE(review): complete() interpolates these args as counts; :success /
    # response_header produce odd log text but callers' contract is unclear.
    http.callback{ write(http) ; complete(:success, http.response_header) }
    http.errback{ complete(:error, http.response_header) }
  end
end
# EventMachine.run do
#   SearchScraper.new(Settings).run!
# end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment