Created
July 18, 2014 14:40
-
-
Save giacecco/aca6b5f86328c817c2b5 to your computer and use it in GitHub Desktop.
Prints lists of article URLs from Wiley Online Library journals
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Maps the one-letter code accepted by --journal to the journal's path on
// onlinelibrary.wiley.com (the path is appended to the site root when
// building request URLs).
var JOURNALS = {
    D: 'journal/10.1002/(ISSN)1860-1324', // Deutsche Entomologische Zeitschrift
    F: 'journal/10.1002/(ISSN)1860-1014', // Fossil Record
    // NOTE(review): the trailing "a" is unusual for an ISSN - confirm against the live site
    Z: 'journal/10.1002/(ISSN)1860-0743a', // Zoosystematics and Evolution
};
// Command-line options: --journal and --year are mandatory, --throttle
// defaults to 30.
var argv = require('yargs')
    .usage('Usage: $0 --journal D|F|Z --year <year>|all [--throttle <max_requests_per_minute>]')
    .example('$0 --journal Z --year 2011', 'prints the urls for all Zoosystematics and Evolution articles in 2011')
    .demand([ 'journal', 'year' ])
    .default('throttle', 30) // max number of requests per minute
    .argv; // https://github.com/chevex/yargs
var async = require('async'); // https://github.com/caolan/async
var cheerio = require('cheerio'); // https://github.com/cheeriojs/cheerio
var request = require('request'); // https://github.com/mikeal/request
var RateLimiter = require('limiter').RateLimiter; // https://github.com/jhurliman/node-rate-limiter
// Shared limiter used by every HTTP request below. The radix is explicit so
// the throttle value is always parsed as a decimal number.
var limiter = new RateLimiter(parseInt(argv.throttle, 10), 'minute');
/**
 * Fetches the "issues" page of a journal and collects the years listed in it.
 *
 * @param {string} journal - journal path, appended to http://onlinelibrary.wiley.com/
 * @param {function(Error, string[]=)} callback - called exactly once, either
 *     with an Error (request failed or status was not 200) or with null and
 *     the list of four-digit years found on the page
 */
var getAllYears = function (journal, callback) {
    // Wait for a token from the shared rate limiter before hitting the site
    limiter.removeTokens(1, function () {
        request('http://onlinelibrary.wiley.com/' + journal + '/issues', function (error, response, body) {
            // Stop here on failure: without the return the body would still be
            // parsed and the callback invoked a second time
            if (error || response.statusCode !== 200) {
                return callback(new Error("Something is wrong"));
            }
            var results = [ ];
            var $ = cheerio.load(body);
            $("#allIssues ol li").each(function (index, element) {
                // The <li> itself is inspected (its id and inner HTML), not a nested <a>
                var item = $(element);
                if (!item.attr("id")) return;
                // The year is the four digits ending the link's href query string
                var found = (item.html() || "").match(/href="[\s\S]*=(\d\d\d\d)" class/);
                // Skip items whose markup does not match instead of throwing on null
                if (found) results.push(found[1]);
            });
            callback(null, results);
        });
    });
};
/**
 * Fetches the issues page of a journal for one year and collects the
 * absolute URLs of the issues listed under that year.
 *
 * @param {string} journal - journal path, appended to http://onlinelibrary.wiley.com/
 * @param {string|number} year - year to list; converted to a string
 * @param {function(Error, string[]=)} callback - called exactly once, either
 *     with an Error (request failed or status was not 200) or with null and
 *     the list of issue URLs
 */
var getIssuesByYear = function (journal, year, callback) {
    var yearText = String(year);
    // Wait for a token from the shared rate limiter before hitting the site
    limiter.removeTokens(1, function () {
        request('http://onlinelibrary.wiley.com/' + journal + '/issues?activeYear=' + yearText, function (error, response, body) {
            // Stop here on failure: without the return the body would still be
            // parsed and the callback invoked a second time
            if (error || response.statusCode !== 200) {
                return callback(new Error("Something is wrong"));
            }
            var results = [ ];
            var $ = cheerio.load(body);
            $("#year_" + yearText + " ol li").each(function (index, element) {
                // The link is pulled out of the <li>'s inner HTML; the href is site-relative
                var found = ($(element).html() || "").match(/<a href="([\s\S]*)" shape=/);
                // Skip items whose markup does not match instead of throwing on null
                if (found) results.push("http://onlinelibrary.wiley.com" + found[1]);
            });
            callback(null, results);
        });
    });
};
/**
 * Fetches an issue page and collects the absolute URLs of the article
 * abstracts linked from it.
 *
 * @param {string} issueUrl - full URL of the issue page
 * @param {function(Error, string[]=)} callback - called exactly once, either
 *     with an Error (request failed or status was not 200) or with null and
 *     the list of abstract URLs
 */
var getArticlesByIssue = function (issueUrl, callback) {
    // Wait for a token from the shared rate limiter before hitting the site
    limiter.removeTokens(1, function () {
        request(issueUrl, function (error, response, body) {
            // Stop here on failure: without the return the body would still be
            // parsed and the callback invoked a second time
            if (error || response.statusCode !== 200) {
                return callback(new Error("Something is wrong"));
            }
            var results = [ ];
            var $ = cheerio.load(body);
            // $("#group1 ol.articles li").each(function (index, element) {
            $("li").each(function (index, element) {
                var item = $(element);
                // Only list items whose whole text is "Abstract" are article links
                if (item.text() !== "Abstract") return;
                var found = (item.html() || "").match(/<a href="([\s\S]*)\/abstract"/);
                // Skip items whose markup does not match instead of throwing on null
                if (found) results.push("http://onlinelibrary.wiley.com" + found[1] + "/abstract");
            });
            callback(null, results);
        });
    });
};
/**
 * Collects the article URLs of every issue a journal published in one year.
 * Issues are processed one after the other via async.reduce and their
 * article lists are concatenated in issue order.
 *
 * @param {string} journal - journal path, passed through to getIssuesByYear
 * @param {string|number} year - year to list; converted to a string
 * @param {function(Error, string[]=)} callback - receives the first error
 *     encountered, or null and the concatenated list of article URLs
 */
var getArticlesByYear = function (journal, year, callback) {
    getIssuesByYear(journal, year.toString(), function (err, issuesUrls) {
        // Without this guard a failed lookup would hand undefined to async.reduce
        if (err) return callback(err);
        async.reduce(issuesUrls, [ ], function (memo, issueUrl, next) {
            getArticlesByIssue(issueUrl, function (err, articlesUrls) {
                if (err) return next(err);
                next(null, memo.concat(articlesUrls));
            });
        }, callback);
    });
};
/**
 * Prints, one per line, the article URLs of a journal for the year given
 * in argv.year, or for every year listed on the journal's issues page when
 * argv.year is "all" (case-insensitive).
 *
 * On failure the error message is written to stderr and the process exit
 * code is set to 1; nothing is printed to stdout.
 *
 * @param {string} journal - journal path, passed through to the lookups
 */
var fetch = function (journal) {
    // Final step shared by both paths: print the URLs or report the failure
    // rather than crashing on a missing result list
    var report = function (err, articlesUrls) {
        if (err) {
            console.error(err.message);
            process.exitCode = 1;
            return;
        }
        console.log(articlesUrls.join("\n"));
    };
    // Walks the years one after the other, concatenating their article lists
    var fetch2 = function (years) {
        async.reduce(years, [ ], function (memo, year, next) {
            getArticlesByYear(journal, year, function (err, articlesUrls) {
                if (err) return next(err);
                next(null, memo.concat(articlesUrls));
            });
        }, report);
    };
    // yargs may hand over the year as a number, so normalise it without
    // modifying the shared argv object
    var requestedYear = argv.year.toString();
    if (requestedYear.toLowerCase() !== "all") {
        fetch2([ requestedYear ]);
    } else {
        getAllYears(journal, function (err, years) {
            if (err) return report(err);
            fetch2(years);
        });
    }
};
// Entry point. Reject unknown journal codes up front: an unknown code would
// otherwise resolve to undefined and be concatenated into the request URL.
if (!Object.prototype.hasOwnProperty.call(JOURNALS, argv.journal)) {
    console.error("Unknown journal '" + argv.journal + "', expected one of: " + Object.keys(JOURNALS).join(", "));
    process.exit(1);
}
fetch(JOURNALS[argv.journal]);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment