Last active
December 12, 2017 02:36
-
-
Save agarie/a417613c092bf11f32fb2e48d5aa8e59 to your computer and use it in GitHub Desktop.
Quick script I made to download papers from NIPS. The subjects are specified in `SUBJECTS_RE`.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'fileutils' | |
require 'nokogiri' | |
require 'open-uri' | |
require 'pp' | |
require 'typhoeus' | |
# Subjects of interest, as a case-sensitive alternation of literal keywords.
# abstract_eligible? lower-cases the abstract before matching against it,
# which is why every keyword here is written in lower case.
SUBJECTS_RE = Regexp.union(
  'deep',
  'deeply',
  'neural',
  'convolutional',
  'network',
  'recurrent',
  'lstm',
  'object recognition',
  'object classification',
  'object detection',
  'image classification'
)
# Builds the URL of the page that lists every paper of one NIPS volume.
#
# @param issue [Integer] volume number; the year in the URL is 1987 + issue
# @return [String] URL of the volume's paper list on papers.nips.cc
def paper_list_url(issue)
  year = 1987 + issue
  format(
    'https://papers.nips.cc/book/advances-in-neural-information-processing-systems-%s-%s',
    issue,
    year
  )
end
# Tells whether the abstract on a paper page mentions a subject of interest.
#
# @param response_body [String] HTML of a paper page
# @return [Integer, nil] index of the first SUBJECTS_RE match within the
#   lower-cased text of the page's `p.abstract` elements, or nil when no
#   subject matches (so the result is usable directly as a condition)
def abstract_eligible?(response_body)
  page = Nokogiri::HTML(response_body)
  abstract_text = page.css("p.abstract").text.downcase
  SUBJECTS_RE =~ abstract_text
end
# Stores already-fetched content on disk in binary mode, announcing the
# target file name on standard output first.
#
# @param content [String] bytes to store
# @param filename [String] destination path; an existing file is overwritten
# @return [Integer] number of bytes written
def download_to_file(content, filename)
  $stdout.puts "Downloading: #{filename}"
  File.binwrite(filename, content)
end
# Downloads the PDF and the BibTeX entry of every paper of a NIPS volume
# whose abstract matches SUBJECTS_RE, saving them under "NIPS-<issue>/".
#
# The volume index is fetched synchronously. Paper pages, PDFs and BibTeX
# files are then fetched through a Typhoeus hydra, at most 4 requests at a
# time. Responses that did not succeed are reported on stderr and skipped
# instead of being analysed or written to disk.
#
# @param issue [Integer] volume number, as accepted by paper_list_url
# @return the value of Typhoeus::Hydra#run
def download_articles(issue)
  nips_dir = "NIPS-#{issue}"
  # mkdir_p does nothing when the directory already exists, so no existence
  # check is needed (Dir.exists? was removed in Ruby 3.2).
  FileUtils.mkdir_p(nips_dir)

  # Kernel#open no longer opens URLs since Ruby 3.0; open-uri's URI.open does.
  index = URI.open(paper_list_url(issue)) { |f| Nokogiri::HTML(f) }
  paper_links = index.css("a[href]").select { |link| link['href'].start_with?("/paper/") }

  hydra = Typhoeus::Hydra.new(max_concurrency: 4)

  paper_links.each do |paper|
    # This part is done offline: generate URLs and filenames.
    paper_url = File.join("https://papers.nips.cc", paper['href'])
    pdf_url = "#{paper_url}.pdf"
    bibtex_url = "#{paper_url}/bibtex"
    pdf_name = File.join(nips_dir, File.basename(pdf_url))
    # Anchor on the extension so a ".pdf" elsewhere in the path is left alone.
    bibtex_name = pdf_name.sub(/\.pdf\z/, ".bib")

    # The paper page is fetched first; its PDF and BibTeX are only queued
    # once the abstract turns out to be about one of the subjects.
    page_request = Typhoeus::Request.new(paper_url)
    page_request.on_complete do |response|
      puts "Analysing #{paper_url}"
      if !response.success?
        warn "Skipping #{paper_url}: request failed (HTTP #{response.code})"
      elsif abstract_eligible?(response.body)
        queue_download(hydra, pdf_url, pdf_name)
        queue_download(hydra, bibtex_url, bibtex_name)
      end
    end
    hydra.queue page_request
  end

  hydra.run
end

# Queues a request for +url+ on +hydra+; when it completes successfully the
# response body is saved to +filename+, otherwise the failure is reported on
# stderr and nothing is written.
def queue_download(hydra, url, filename)
  request = Typhoeus::Request.new(url)
  request.on_complete do |response|
    if response.success?
      download_to_file(response.body, filename)
    else
      warn "Skipping #{url}: request failed (HTTP #{response.code})"
    end
  end
  hydra.queue request
end
# Script entry point: the first command-line argument is the NIPS volume
# number. A missing or non-numeric argument converts to 0, which would build
# a URL for a nonexistent volume, so it is rejected with a usage message.
issue = ARGV.first.to_i
abort "Usage: #{$PROGRAM_NAME} ISSUE_NUMBER (e.g. 29 for NIPS 2016)" unless issue.positive?
download_articles(issue)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment