Skip to content

Instantly share code, notes, and snippets.

@thibauts
Last active August 3, 2018 11:50
Show Gist options
  • Save thibauts/b276cbd41c31e73eeb40 to your computer and use it in GitHub Desktop.
Save thibauts/b276cbd41c31e73eeb40 to your computer and use it in GitHub Desktop.
Extract links from a webpage the streaming way
// Both modules are loaded up front: extractStream() calls concat() as soon as
// it is invoked, so `concat` must already be assigned before the pipeline
// below is built.
var request = require('request');
var concat = require('concat-stream');

// Matches an anchor tag with a double-quoted href on a single line.
// Capture 1 is the href value, capture 2 is the text between the tags.
var extractLinksStream = extractStream.bind(null, /<a.*?href="(.*?)".*?>(.*?)<\/a>/gi);

// The page to scan is taken from the first command-line argument.
var url = process.argv[2];
if (!url) {
  console.error('Usage: node <script> <url>');
  process.exit(1);
}

request(url)
  .on('error', function(err) {
    // Report request failures instead of dying on an unhandled 'error' event.
    console.error(err.message);
    process.exitCode = 1;
  })
  .pipe(extractLinksStream(function(found) {
    var hrefs = found.map(function(item) { return item[1]; }); // Extract 1st capture
    console.log(hrefs);
  }));
/* ----------------------------------------------- */
/**
 * Builds a writable stream that buffers everything written to it and, once
 * the input ends, hands every match of `re` in the buffered text to `callback`.
 *
 * @param {RegExp} re - Pattern passed through to extract().
 * @param {function(Array)} callback - Receives the array returned by extract().
 * @returns {*} The stream created by concat().
 */
function extractStream(re, callback) {
  function onComplete(buf) {
    var text = buf.toString();
    var found = extract(text, re);
    callback(found);
  }

  return concat(onComplete);
}
/**
 * Collects the matches of `re` in `str`.
 *
 * Each entry is the array returned by RegExp#exec (full match followed by the
 * capture groups) with its `index` and `input` properties removed.
 *
 * @param {string} str - Text to search.
 * @param {RegExp} re - Pattern to apply. With the `g` (or `y`) flag every
 *   match is returned; without it at most the first match is returned.
 *   The regex's `lastIndex` is reset before scanning.
 * @returns {Array<Array<string>>} The matches, in order of appearance.
 */
function extract(str, re) {
  var found = [];
  var matches;

  // A reused /g regex may carry a stale lastIndex; always scan from the start.
  re.lastIndex = 0;

  while ((matches = re.exec(str)) !== null) {
    delete matches.index;
    delete matches.input;
    found.push(matches);

    // Without `g`/`y`, exec() never advances and would return the same match
    // forever, so stop after the first one.
    if (!re.global && !re.sticky) {
      break;
    }

    // A zero-length match leaves lastIndex unchanged; step past it manually
    // to avoid looping on the same position.
    if (matches[0] === '') {
      re.lastIndex++;
    }
  }

  return found;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment