Skip to content

Instantly share code, notes, and snippets.

@giacecco
Created July 18, 2014 14:40
Show Gist options
  • Save giacecco/aca6b5f86328c817c2b5 to your computer and use it in GitHub Desktop.
Save giacecco/aca6b5f86328c817c2b5 to your computer and use it in GitHub Desktop.
Prints the abstract URLs of the articles published in a Wiley journal, for a single year or for all years.
// Paths (relative to http://onlinelibrary.wiley.com/) of the supported
// journals, keyed by the one-letter code accepted by the --journal option.
var JOURNALS = {
    // Deutsche Entomologische Zeitschrift
    'D': 'journal/10.1002/(ISSN)1860-1324',
    // Fossil Record
    'F': 'journal/10.1002/(ISSN)1860-1014',
    // Zoosystematics and Evolution
    'Z': 'journal/10.1002/(ISSN)1860-0743a'
};
// Command-line arguments, parsed with yargs (https://github.com/chevex/yargs).
// --journal and --year are mandatory; --throttle defaults to 30.
var argv = require('yargs')
    .usage('Usage: $0 --journal D|F|Z --year <year>|all [--throttle <max_requests_per_minute>]')
    .example('$0 --journal Z --year 2011', 'prints the urls for all Zoosystematics and Evolution articles in 2011')
    .demand([ 'journal', 'year' ])
    .default('throttle', 30) // max number of requests per minute
    .argv;
// https://github.com/caolan/async
var async = require('async');
// https://github.com/cheeriojs/cheerio
var cheerio = require('cheerio');
// https://github.com/mikeal/request
var request = require('request');
// https://github.com/jhurliman/node-rate-limiter
var RateLimiter = require('limiter').RateLimiter;
// Shared limiter: each HTTP request in this script first removes one token
// from it. The radix is explicit so the value is always read as decimal.
var limiter = new RateLimiter(parseInt(argv.throttle, 10), 'minute');
// Downloads the journal's "issues" index page and collects the years it lists.
//
// journal  - journal path under http://onlinelibrary.wiley.com/
// callback - function (err, years); called exactly once, either with an Error
//            when the request fails or the status is not 200, or with
//            (null, years) where years is an array of four-digit year strings.
var getAllYears = function (journal, callback) {
    // Take one token from the shared rate limiter before hitting the site.
    limiter.removeTokens(1, function () {
        request('http://onlinelibrary.wiley.com/' + journal + '/issues', function (error, response, body) {
            // Stop here on failure: falling through would parse a missing
            // body and invoke the callback a second time.
            if (error || response.statusCode !== 200) {
                return callback(new Error("Something is wrong"));
            }
            var $ = cheerio.load(body);
            var results = [ ];
            $("#allIssues ol li").each(function (index, element) {
                // NOTE(review): with the element passed first, the "a"
                // argument looks like it is ignored and this simply wraps
                // the <li> itself - confirm against the cheerio version in use.
                var item = $(this, "a");
                if (!item.attr("id")) {
                    return;
                }
                // The year is the 4-digit value that ends the link's href.
                var found = (item.html() || "").match(/href="[\s\S]*=(\d\d\d\d)" class/);
                // Items whose markup does not match are skipped rather than
                // crashing the whole run with a TypeError.
                if (found) {
                    results.push(found[1]);
                }
            });
            callback(null, results);
        });
    });
};
// Downloads the journal's issue list for one year and collects the absolute
// URL of every issue found in it.
//
// journal  - journal path under http://onlinelibrary.wiley.com/
// year     - the year, as a number or a string
// callback - function (err, issueUrls); called exactly once, either with an
//            Error when the request fails or the status is not 200, or with
//            (null, issueUrls).
var getIssuesByYear = function (journal, year, callback) {
    var yearText = year.toString();
    // Take one token from the shared rate limiter before hitting the site.
    limiter.removeTokens(1, function () {
        request('http://onlinelibrary.wiley.com/' + journal + '/issues?activeYear=' + yearText, function (error, response, body) {
            // Stop here on failure: falling through would parse a missing
            // body and invoke the callback a second time.
            if (error || response.statusCode !== 200) {
                return callback(new Error("Something is wrong"));
            }
            var $ = cheerio.load(body);
            var results = [ ];
            $("#year_" + yearText + " ol li").each(function (index, element) {
                // NOTE(review): with the element passed first, the "a"
                // argument looks like it is ignored and this simply wraps
                // the <li> itself - confirm against the cheerio version in use.
                var found = ($(this, "a").html() || "").match(/<a href="([\s\S]*)" shape=/);
                // Items without a recognisable link are skipped rather than
                // crashing the whole run with a TypeError.
                if (found) {
                    // The captured href is site-relative; prefix the host.
                    results.push("http://onlinelibrary.wiley.com" + found[1]);
                }
            });
            callback(null, results);
        });
    });
};
// Downloads one issue page and collects the absolute "abstract" URL of every
// article listed on it.
//
// issueUrl - absolute URL of the issue page
// callback - function (err, articleUrls); called exactly once, either with an
//            Error when the request fails or the status is not 200, or with
//            (null, articleUrls).
var getArticlesByIssue = function (issueUrl, callback) {
    // Take one token from the shared rate limiter before hitting the site.
    limiter.removeTokens(1, function () {
        request(issueUrl, function (error, response, body) {
            // Stop here on failure: falling through would parse a missing
            // body and invoke the callback a second time.
            if (error || response.statusCode !== 200) {
                return callback(new Error("Something is wrong"));
            }
            var $ = cheerio.load(body);
            var results = [ ];
            // Every <li> on the page is scanned; the narrower selector
            // "#group1 ol.articles li" was left commented out by the author.
            $("li").each(function (index, element) {
                // NOTE(review): with the element passed first, the "a"
                // argument looks like it is ignored and this simply wraps
                // the <li> itself - confirm against the cheerio version in use.
                var item = $(this, "a");
                // Only items whose whole text is the "Abstract" link matter.
                if (item.text() !== "Abstract") {
                    return;
                }
                var found = (item.html() || "").match(/<a href="([\s\S]*)\/abstract"/);
                // Items without a recognisable link are skipped rather than
                // crashing the whole run with a TypeError.
                if (found) {
                    results.push("http://onlinelibrary.wiley.com" + found[1] + "/abstract");
                }
            });
            callback(null, results);
        });
    });
};
// Collects the article URLs of every issue the journal published in a year.
// Issues are processed one after the other and their article lists are
// concatenated in issue order.
//
// journal  - journal path under http://onlinelibrary.wiley.com/
// year     - the year, as a number or a string
// callback - function (err, articleUrls); receives the first error raised
//            while listing the issues or reading any of them.
var getArticlesByYear = function (journal, year, callback) {
    getIssuesByYear(journal, year.toString(), function (err, issuesUrls) {
        // Without this guard a failed listing would hand an undefined
        // collection to async.reduce.
        if (err) {
            return callback(err);
        }
        async.reduce(issuesUrls, [ ], function (memo, issueUrl, next) {
            getArticlesByIssue(issueUrl, function (err, articlesUrls) {
                // On failure pass the accumulator through untouched instead
                // of appending an undefined entry to it.
                next(err, err ? memo : memo.concat(articlesUrls));
            });
        }, callback);
    });
};
// Prints, one per line on stdout, the article URLs of the given journal for
// the year selected by argv.year ("all", in any letter case, means every year
// listed on the journal's issues page). Any failure is reported on stderr and
// turns the process exit code into 1 instead of printing partial output.
//
// journal - journal path under http://onlinelibrary.wiley.com/
var fetch = function (journal) {
    // Reports a failed run without throwing from inside a callback.
    var reportFailure = function (err) {
        console.error(err.message || String(err));
        process.exitCode = 1;
    };
    // Gathers the articles of each year in turn, then prints them all.
    var fetch2 = function (years) {
        async.reduce(years, [ ], function (memo, year, next) {
            getArticlesByYear(journal, year, function (err, articlesUrls) {
                // On failure pass the accumulator through untouched instead
                // of appending an undefined entry to it.
                next(err, err ? memo : memo.concat(articlesUrls));
            });
        }, function (err, articlesUrls) {
            if (err) {
                return reportFailure(err);
            }
            console.log(articlesUrls.join("\n"));
        });
    };
    // The year may have been parsed as a number; normalise it to a string.
    argv.year = argv.year.toString();
    if (argv.year.toLowerCase() !== "all") {
        fetch2([ argv.year ]);
    } else {
        getAllYears(journal, function (err, years) {
            if (err) {
                return reportFailure(err);
            }
            fetch2(years);
        });
    }
};
// Entry point: resolve the --journal code to its path and start fetching.
// An unknown code is rejected up front; otherwise `undefined` would be
// spliced into the request URL.
if (Object.prototype.hasOwnProperty.call(JOURNALS, argv.journal)) {
    fetch(JOURNALS[argv.journal]);
} else {
    console.error('Unknown journal "' + argv.journal + '"; expected one of: ' + Object.keys(JOURNALS).join(', '));
    process.exitCode = 1;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment