Created
November 15, 2011 23:22
-
-
Save kornypoet/1368717 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| require 'rspec' | |
| require './search_scraper' | |
| require 'em-synchrony' | |
# Scratch directory for the spec: a "tmp" folder alongside this file.
def tmp_dir
  here = File.dirname(__FILE__)
  File.join(here, 'tmp')
end
# Directory under tmp_dir named after the namespace used in cfg_hash;
# the spec creates it before the examples and removes it afterwards.
def ripd_dir
  namespace_folder = 'social.network.twitter'
  File.join(tmp_dir, namespace_folder)
end
# Options hash handed to SearchScraper.new in the spec.
# A fresh hash is built on every call.
def cfg_hash
  settings = {}
  settings[:namespace] = "social.network.twitter"
  settings[:protocol]  = "twitter_search"
  settings[:url]       = "http://search.twitter.com/search.json"
  settings[:ripd_dir]  = tmp_dir
  settings[:keywords]  = ["fail", "occupy", "oldspice"]
  settings
end
# Spec for SearchScraper, built from the cfg_hash fixture. Output files land
# under ripd_dir, which is created before and removed after the whole group.
describe SearchScraper do
  before(:all) { FileUtils.mkdir_p ripd_dir }
  after(:all)  { FileUtils.remove_dir ripd_dir }

  # Doubles standing in for an HTTP response and its header.
  let(:header) { double :header, :status => 200 }
  let(:res) do
    double :res, :response => '{"foo":"bar"}', :response_header => header
  end

  subject { SearchScraper.new cfg_hash }

  # Attribute readers populated by the constructor.
  its(:data_file)  { should match("#{ripd_dir}/twitter_search") }
  its(:connection) { should be_instance_of(EventMachine::HttpConnection) }
  its(:config)     { should eq(cfg_hash) }
  its(:keywords)   { should eq(%w[fail occupy oldspice]) }

  it "should memoize its ripd_file" do
    subject.ripd_file.should equal(subject.ripd_file)
  end

  it "should include a Log module" do
    SearchScraper.const_defined?(:Log).should be_true
  end

  it "should write to the ripd_file and flush the file afterwards" do
    output = subject.ripd_file
    output.should_receive(:puts).with("200\t{\"foo\":\"bar\"}").ordered
    output.should_receive(:flush).ordered
    subject.write(res)
  end

  it "should properly format writes to the ripd_file" do
    subject.write(res)
    File.read(subject.ripd_file).should eq("200\t{\"foo\":\"bar\"}\n")
  end

  it "should properly construct the params hash" do
    first_page = { :query => { "rpp" => 100, "q" => "foo" } }
    second_page = { :query => { "rpp" => 100, "q" => "bar", "page" => 2 } }

    subject.params("foo").should eq(first_page)
    subject.params("bar", 2).should eq(second_page)
  end

  it "should shutdown properly" do
    Log.should_receive(:info).with("Search scrape complete. Completions: 100. Failures: 1")
    subject.ripd_file.should_receive(:close)
    # EventMachine.should_receive(:stop)
    # complete stops the reactor, so it has to run inside one.
    EventMachine.synchrony { subject.complete(100, 1) }
  end
end
| ################################ | |
| require 'rubygems' | |
| require 'eventmachine' | |
| require 'em-http' | |
| require 'em-http/middleware/oauth' | |
| require 'wukong/filename_pattern' | |
| require 'gorillib' | |
| require 'gorillib/logger/log' | |
| require 'gorillib/hash/deep_compact' | |
| require 'configliere' ; Settings.use(:commandline).resolve! | |
| Settings.read Settings.rest.first | |
# Issues one search request per configured keyword and appends every
# successful response to an output ("ripd") file as "<status>\t<body>" lines.
class SearchScraper
  # ripd_file is deliberately absent here: it has a hand-written, lazily
  # opening reader below, and listing it twice triggers a redefinition warning.
  attr_reader :connection, :data_file, :config, :keywords

  # options - Hash read with these symbol keys:
  #   :ripd_dir, :namespace, :protocol - substituted into the output file path
  #   :keywords - search terms to request
  #   :url      - endpoint the HTTP connection is created for
  #   :oauth    - optional; when present, passed to the OAuth middleware
  def initialize(options = {})
    # The remaining tokens (:timestamp, :hostname, :pid) are not supplied here;
    # presumably Wukong::FilenamePattern fills them in - confirm against wukong.
    @data_file = Wukong::FilenamePattern.new(
      ":ripd_dir/:namespace/:protocol-:timestamp-:hostname-:pid.json",
      :ripd_dir  => options[:ripd_dir],
      :namespace => options[:namespace],
      :protocol  => options[:protocol]
    ).make
    @keywords   = options[:keywords]
    @connection = EventMachine::HttpRequest.new options[:url]
    @connection.use EventMachine::Middleware::OAuth, options[:oauth] if options[:oauth]
    @config     = options
  end

  # Output file handle, opened in append mode on first use and then reused.
  def ripd_file
    @ripd_file ||= File.open(data_file, 'a')
  end

  # Logs the final tallies, closes the output file and stops EventMachine.
  #
  # comps - number of requests that succeeded
  # fails - number of requests that failed
  def complete(comps, fails)
    Log.info("Search scrape complete. Completions: #{comps}. Failures: #{fails}")
    ripd_file.close
    EventMachine.stop
  end

  # Builds the request options for one keyword.
  #
  # keyword - the search term, sent as "q"
  # page    - optional page number; deep_compact! is relied on to drop the
  #           "page" entry when it is nil
  #
  # Returns a Hash of the form { :query => { "rpp" => 100, "q" => keyword, ... } }.
  def params(keyword, page = nil)
    { :query => { "rpp" => 100, "q" => keyword, "page" => page } }.deep_compact!
  end

  # Appends one "<status>\t<body>" line for the given response and flushes
  # immediately so the line reaches the file right away.
  #
  # res - object responding to #response_header (with #status) and #response
  def write(res)
    code = res.response_header.status
    body = res.response
    ripd_file.puts [code, body].join("\t")
    ripd_file.flush
  end

  # Requests every keyword, writes each successful response, and calls
  # complete with the success/failure tallies once the last request settles.
  #
  # The previous version called params without the required keyword (raising
  # ArgumentError) and passed a symbol and a header object to complete, which
  # expects counts.
  #
  # NOTE(review): all requests go through the single shared connection; confirm
  # whether em-http needs keep-alive or pipelining options for that to work.
  def run!
    Log.info("Connection opened. Job details:\n#{config.inspect}")

    terms = Array(keywords)
    # Nothing to request: finish right away instead of leaving the reactor running.
    return complete(0, 0) if terms.empty?

    completions = 0
    failures    = 0
    outstanding = terms.size
    settle = lambda do
      outstanding -= 1
      complete(completions, failures) if outstanding.zero?
    end

    terms.each do |keyword|
      http = connection.get(params(keyword))
      http.callback do
        write(http)
        completions += 1
        settle.call
      end
      http.errback do
        failures += 1
        settle.call
      end
    end
  end
end
| # EventMachine.run do | |
| # SearchScraper.new(Settings).run! | |
| # end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment