Created
September 11, 2017 19:34
-
-
Save max-mapper/45c0312e57031e8643b799529b4dec46 to your computer and use it in GitHub Desktop.
Scrape millions of HTML files in a folder structure
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var $ = require('cheerio')
var fs = require('fs')
var walker = require('folder-walker')
var transform = require('parallel-transform')
var ndjson = require('ndjson')
// Pipeline: walk ./pageblobs -> scrape each entry -> newline-delimited JSON -> stdout
var walk = walker('./pageblobs') // generated by abstract-blob-store
// First argument is passed to parallel-transform as the maximum number of
// scrape calls allowed to run at once
var scraper = transform(10, scrape)
// Serializes every object produced by scrape
var out = ndjson.serialize()
walk.pipe(scraper).pipe(out).pipe(process.stdout)
/**
 * Collect the `href` of every `.some-class` element in one walked entry.
 *
 * Entries whose `type` is 'directory' are skipped by calling back with no
 * arguments. For any other entry the file at `entry.filepath` is read, parsed
 * with cheerio, and the collected hrefs are handed to the callback.
 *
 * @param {{type: string, filepath: string}} entry - entry piped in from the walker
 * @param {Function} cb - node-style callback: `cb()` for a skipped directory,
 *   `cb(err)` when the file cannot be read, otherwise `cb(null, {links: [...]})`
 */
function scrape (entry, cb) {
  if (entry.type === 'directory') return cb()
  fs.readFile(entry.filepath, function (err, buff) {
    if (err) return cb(err)
    var parsedHTML = $.load(buff.toString())
    var links = []
    // .each rather than .map: the callback runs only for its side effect
    parsedHTML('.some-class').each(function (i, el) {
      // Wrap the raw element with the instance bound to this document (same
      // pattern as jQuery) instead of calling the bare module export, which
      // is deprecated and no longer callable in cheerio 1.0
      links.push(parsedHTML(el).attr('href'))
    })
    cb(null, {links: links})
  })
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment