Skip to content

Instantly share code, notes, and snippets.

@blahah
Last active August 29, 2015 14:22
Show Gist options
  • Save blahah/95bf793b3c9ddba2d4b6 to your computer and use it in GitHub Desktop.
Save blahah/95bf793b3c9ddba2d4b6 to your computer and use it in GitHub Desktop.
Scrape IEEE full-text HTML by rendering each page in PhantomJS
var webPage = require('webpage');
var fs = require('fs');
// Single PhantomJS page object reused for every URL in the list.
var page = webPage.create();

// Read the URL list (one URL per line). Lines are trimmed and blank lines are
// dropped, so the list works with or without a trailing newline and with
// either LF or CRLF line endings, instead of blindly discarding the last entry.
var urls = fs.read("fulltext_html_urls.txt")
  .split(/\r?\n/)
  .map(function(line) {
    return line.trim();
  })
  .filter(function(line) {
    return line.length > 0;
  });

// Captures the run of digits at the very end of a URL; the load-finished
// handler uses that capture as the base name of the saved file.
var ar_regex = /([0-9]+$)/;

console.log("scraping " + urls.length + " URLs");

// Index into `urls` of the page currently being (or next to be) fetched.
var pageindex = 0;
// URL most recently handed to page.open by the polling loop.
var fileName = '';
// True while a page load is in flight; the polling loop waits while it is set.
var loadInProgress = false;
// Polling loop: every 250 ms, either finish the run (all URLs processed) or,
// if no load is currently in flight, start loading the next URL.
var interval = setInterval(function() {
  // `>=` rather than `==` so the run still terminates if the index ever
  // overshoots the end of the list.
  if (pageindex >= urls.length) {
    // Stop the timer so this callback cannot run again while PhantomJS is
    // shutting down.
    clearInterval(interval);
    console.log("Scraping complete!");
    phantom.exit();
    return;
  }
  if (!loadInProgress) {
    fileName = urls[pageindex];
    // Mark the load as in flight before opening, so the same URL is not
    // reopened on the next tick if the load-started callback has not yet
    // been delivered.
    loadInProgress = true;
    page.open(urls[pageindex]);
  }
}, 250);
// Load-started callback: raise the in-flight flag so the polling loop does
// not call page.open again until this load has finished.
page.onLoadStarted = function handleLoadStarted() {
  loadInProgress = true;
};
// Load-finished callback: save the rendered page source for the current URL
// as "<trailing digits of the URL>.html" and advance to the next URL.
//
// status: load result string reported by PhantomJS ('success' or 'fail').
page.onLoadFinished = function(status) {
  console.log('status: ' + status);
  loadInProgress = false;

  // Ignore events that arrive when there is no current URL (index already
  // past the end); leave the index untouched so the completion check in the
  // polling loop still sees the expected value.
  if (pageindex >= urls.length) {
    return;
  }

  var url = urls[pageindex];
  // Advance before doing anything that can fail, so one bad URL cannot make
  // the polling loop retry the same entry forever.
  pageindex++;

  if (status !== 'success') {
    // Do not write an error/blank document under the article's file name.
    console.log('skipping (load failed): ' + url);
    return;
  }

  var match = ar_regex.exec(url);
  if (!match) {
    // exec returns null when the URL does not end in digits; there is no
    // file name to derive, so skip instead of throwing.
    console.log('skipping (no trailing number in URL): ' + url);
    return;
  }

  var dest = match[1] + ".html";
  console.log('saving: ' + dest);
  fs.write(dest, page.content, 'w'); // rendered page source
};
// Resource-error callback: log which resource failed to load and why.
//
// resourceError: object supplied by PhantomJS; this handler reads its `id`,
// `url`, `errorCode` and `errorString` properties.
page.onResourceError = function(resourceError) {
  // Separator added between the id and the URL label; previously the two
  // ran together as "(#<id>URL:<url>)".
  console.log('Unable to load resource (#' + resourceError.id + ', URL: ' + resourceError.url + ')');
  console.log('Error code: ' + resourceError.errorCode + '. Description: ' + resourceError.errorString);
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment