require 'em-synchrony'
require 'em-synchrony/em-http'
require 'em-synchrony/em-jack'
require 'nokogiri'
require 'zlib'
require 'stringio'
module MendeleyScraper
  class MasterScraper
    # Fetch the Mendeley sitemap index and return every <loc> URL it lists.
    # The HTTP request looks synchronous here because em-synchrony runs it
    # inside a fiber.
    def self.extract_sitemap(sitemap_url)
      puts "extracting locs at #{sitemap_url}"
      http = EventMachine::HttpRequest.new(sitemap_url).get
      puts 'parsing the file'
      sitemap = Nokogiri::XML(http.response)
      sitemap.css("loc").map { |loc| loc.content }
    end
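
    # For reference, a sitemap index is plain XML whose <loc> elements point
    # at the gzipped per-article sitemaps (structure per the sitemaps.org
    # protocol; the file name below is illustrative, not a real Mendeley URL):
    #
    #   <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    #     <sitemap>
    #       <loc>http://www.mendeley.com/sitemap-articles-1.xml.gz</loc>
    #     </sitemap>
    #   </sitemapindex>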
    # Extract the individual article URLs from one of the gzipped article-list
    # XML files that Mendeley provides, then push each URL onto the beanstalk
    # job queue. This operation runs asynchronously.
    def self.extract_article_locations(response_body)
      # Decompress and parse the gzipped sitemap XML
      sitemap = Nokogiri::XML(Zlib::GzipReader.new(StringIO.new(response_body)).read)
      loc_list = sitemap.css("loc").map { |loc| loc.content }
      # Put each article location on our job queue
      puts "connecting to our beanstalk server"
      @jack = EMJack::Connection.new
      @jack.use("urls_to_scrape")
      # Note: only the first 11 URLs are enqueued here, with up to 10
      # puts in flight at once
      EM::Synchrony::Iterator.new(loc_list[0..10], 10).each do |url, iter|
        Fiber.new do
          puts url
          EM::Synchrony.sync @jack.put(url)
          iter.next
        end.resume
      end
    end
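
    # The queue this method fills is meant to be drained by a separate worker
    # process. A minimal sketch of that consumer side, assuming a beanstalkd
    # server on its default port and em-jack's watch/reserve/delete API
    # (the actual scraping is left as a stub):
    #
    #   EM.synchrony do
    #     jack = EMJack::Connection.new
    #     jack.watch("urls_to_scrape")
    #     loop do
    #       job = EM::Synchrony.sync jack.reserve
    #       puts "would scrape #{job.body}"
    #       EM::Synchrony.sync jack.delete(job)
    #     end
    #   end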
    # The main entry point for the master scraper. This method uses a
    # combination of fiber-synchronized and async operations.
    def self.run
      EM.synchrony do
        article_lists = extract_sitemap("http://www.mendeley.com/sitemap-index-articles.xml")
        EM::Synchrony::Iterator.new(article_lists[0..1], 2).each do |url, iter|
          puts "extracting the locations of articles from #{url}"
          http = EventMachine::HttpRequest.new(url).aget
          http.callback {
            Fiber.new do
              extract_article_locations(http.response)
            end.resume
            iter.next
          }
          http.errback { puts "failed to extract articles list from #{url}"; iter.next }
        end
        EM.stop
      end
    end
  end
end
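
# Kick off the master scraper when this file is executed directly. This
# assumes a beanstalkd server is already running on localhost:11300 (the
# EMJack default).
MendeleyScraper::MasterScraper.run if __FILE__ == $PROGRAM_NAME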