require 'em-synchrony'
require 'em-synchrony/em-http'
require 'em-synchrony/em-jack'
require 'nokogiri'
require 'zlib'
require 'stringio'
module MendeleyScraper
  class MasterScraper
    # Fetch the Mendeley sitemap index and return every <loc> URL it lists.
    # The HTTP request looks synchronous here because em-synchrony runs it
    # inside a fiber.
    def self.extract_sitemap(sitemap_url)
      puts "extracting locs at #{sitemap_url}"
      http = EventMachine::HttpRequest.new(sitemap_url).get
      puts 'parsing the file'
      sitemap = Nokogiri::XML(http.response)
      sitemap.css("loc").map { |loc| loc.content }
    end
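
    # For reference, a sitemap index is plain XML whose <loc> elements point
    # at the gzipped per-article sitemaps (structure per the sitemaps.org
    # protocol; the file name below is illustrative, not a real Mendeley URL):
    #
    #   <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    #     <sitemap>
    #       <loc>http://www.mendeley.com/sitemap-articles-1.xml.gz</loc>
    #     </sitemap>
    #   </sitemapindex>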
    # Extract the individual article URLs from one of the gzipped article-list
    # XML files that Mendeley provides, then push each URL onto the beanstalk
    # job queue. This operation runs asynchronously.
    def self.extract_article_locations(response_body)
      # Decompress and parse the gzipped sitemap XML
      sitemap = Nokogiri::XML(Zlib::GzipReader.new(StringIO.new(response_body)).read)
      loc_list = sitemap.css("loc").map { |loc| loc.content }
      # Put each article location on our job queue
      puts "connecting to our beanstalk server"
      @jack = EMJack::Connection.new
      @jack.use("urls_to_scrape")
      # Note: only the first 11 URLs are enqueued here, with up to 10
      # puts in flight at once
      EM::Synchrony::Iterator.new(loc_list[0..10], 10).each do |url, iter|
        Fiber.new do
          puts url
          EM::Synchrony.sync @jack.put(url)
          iter.next
        end.resume
      end
    end
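
    # The queue this method fills is meant to be drained by a separate worker
    # process. A minimal sketch of that consumer side, assuming a beanstalkd
    # server on its default port and em-jack's watch/reserve/delete API
    # (the actual scraping is left as a stub):
    #
    #   EM.synchrony do
    #     jack = EMJack::Connection.new
    #     jack.watch("urls_to_scrape")
    #     loop do
    #       job = EM::Synchrony.sync jack.reserve
    #       puts "would scrape #{job.body}"
    #       EM::Synchrony.sync jack.delete(job)
    #     end
    #   end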
    # The main entry point for the master scraper. This method uses a
    # combination of fiber-synchronized and async operations.
    def self.run
      EM.synchrony do
        article_lists = extract_sitemap("http://www.mendeley.com/sitemap-index-articles.xml")
        EM::Synchrony::Iterator.new(article_lists[0..1], 2).each do |url, iter|
          puts "extracting the locations of articles from #{url}"
          http = EventMachine::HttpRequest.new(url).aget
          http.callback {
            Fiber.new do
              extract_article_locations(http.response)
            end.resume
            iter.next
          }
          http.errback { puts "failed to extract articles list from #{url}"; iter.next }
        end
        EM.stop
      end
    end
  end
end
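
# Kick off the master scraper when this file is executed directly. This
# assumes a beanstalkd server is already running on localhost:11300 (the
# EMJack default).
MendeleyScraper::MasterScraper.run if __FILE__ == $PROGRAM_NAME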