Skip to content

Instantly share code, notes, and snippets.

@max-mapper
Created September 11, 2017 19:34
Show Gist options
  • Save max-mapper/45c0312e57031e8643b799529b4dec46 to your computer and use it in GitHub Desktop.
Save max-mapper/45c0312e57031e8643b799529b4dec46 to your computer and use it in GitHub Desktop.
Scrape millions of HTML files in a folder structure
var $ = require('cheerio')
var fs = require('fs')
var walker = require('folder-walker')
var transform = require('parallel-transform')
var ndjson = require('ndjson')
// Wire the whole job up as one stream chain:
//   entries found under ./pageblobs (generated by abstract-blob-store)
//     -> scrape(), run through parallel-transform with 10 as its first argument
//     -> ndjson serializer
//     -> stdout
walker('./pageblobs')
  .pipe(transform(10, scrape))
  .pipe(ndjson.serialize())
  .pipe(process.stdout)
/**
 * Stream worker: reads the file behind one walked entry and collects the
 * `href` attribute of every element matching `.some-class`.
 *
 * @param {{type: string, filepath: string}} entry - Item received from the
 *   upstream walker; only `type` and `filepath` are read here.
 * @param {function(Error=, {links: Array}=): void} cb - Node-style callback.
 *   Called with no arguments for directory entries, with the read error if
 *   the file cannot be read, otherwise with `null` and `{links: [...]}`.
 *   Matched elements that have no `href` contribute `undefined`.
 */
function scrape (entry, cb) {
  // Directories have no content to parse; call back without a result.
  if (entry.type === 'directory') return cb()

  fs.readFile(entry.filepath, function (err, buff) {
    if (err) return cb(err)

    var parsedHTML = $.load(buff.toString())

    // Wrap each raw element with the instance returned by load() (same
    // pattern as jQuery). Using that instance rather than the module-level
    // `$` keeps the element bound to the document it was parsed from and does
    // not depend on the cheerio module itself being callable.
    var links = parsedHTML('.some-class').toArray().map(function (el) {
      return parsedHTML(el).attr('href')
    })

    cb(null, {links: links})
  })
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment