Created
May 22, 2018 21:44
-
-
Save kadamwhite/c0ffa6087fce11e9b48d03a161e63bf8 to your computer and use it in GitHub Desktop.
Scrape Gutenberg handbook content to local markdown & images
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
------------- | |
mkdirp usage: | |
------------- | |
var mkdirp = require('mkdirp'); | |
mkdirp('/tmp/foo/bar/baz', function (err) { | |
if (err) console.error(err) | |
else console.log('pow!') | |
}); | |
----------------- | |
Salticidae Usage: | |
----------------- | |
spider( 'http://some.url/', { | |
key: $ => $( '.some-selector' ).find( '.to #return' ).text(), | |
} ).then( ( results ) => { | |
console.log( results ); | |
// { key: 'Whatever the text of that node was' } | |
} ); | |
--------------- | |
Turndown Usage: | |
--------------- | |
// For Node.js | |
var TurndownService = require('turndown') | |
var turndownService = new TurndownService() | |
var markdown = turndownService.turndown('<h1>Hello world!</h1>') | |
*/ | |
/* eslint-disable no-unused-vars */ | |
const { join } = require( 'path' ); | |
const { spider } = require( 'salticidae' ); | |
const fs = require( 'fs' ); | |
const https = require( 'https' ); | |
const mkdirp = require( 'mkdirp' ); | |
const rimraf = require( 'rimraf' ); | |
const TurndownService = require( 'turndown' ); | |
const turndownService = new TurndownService(); | |
/**
 * Convert an HTML string to markdown using the shared Turndown service.
 *
 * @param {String} html HTML markup to convert
 * @return {String} Whatever turndownService.turndown produces for that markup
 */
const turndown = function( html ) {
	return turndownService.turndown( html );
};
/**
 * Switch for diagnostic output: log() only prints when this is true.
 */
const VERBOSE = false;

/**
 * Forward every argument to console.log, but only while VERBOSE is enabled.
 *
 * @param {...*} messages Values to print
 */
function log( ...messages ) {
	if ( VERBOSE ) {
		console.log( ...messages );
	}
}
/**
 * URL of the Gutenberg handbook index page; the scrape starts here.
 */
const HANDBOOK_URI = 'https://wordpress.org/gutenberg/handbook/';

/**
 * Absolute path of the output folder: a "handbook" directory inside the
 * current working directory.
 */
const handbookRoot = join( process.cwd(), 'handbook' );
/**
 * Work out the directory, relative to the handbook root, for a handbook URI.
 *
 * Everything up to and including "gutenberg/handbook/" is removed, then a
 * trailing "/segment/" is removed because that segment is used as the file
 * name. A path holding only one segment (e.g. "language/") has no leading
 * slash to match and is therefore returned as-is.
 *
 * @param {String} uri A web URL for a handbook page
 * @return {String} The relative directory, or '' for the handbook root URL
 */
const getSubpath = ( uri ) => {
	// Path of the page relative to the handbook URI root.
	const relativePath = uri.replace( /^.*gutenberg\/handbook\//, '' );
	// Cut off the final segment; it is reserved for the filename.
	return relativePath.replace( /\/[^/]+\/$/, '' );
};
/**
 * Build the markdown filename for a handbook page from the last non-empty
 * path segment of its URI.
 *
 * @param {String} uri A web URL for a handbook page
 * @return {String} The last URI segment with ".md" appended
 */
const getFilename = ( uri ) => {
	// Empty segments (e.g. from a trailing slash) are discarded first.
	const segments = uri.trim().split( '/' ).filter( Boolean );
	const lastSegment = segments.pop();
	return `${ lastSegment }.md`;
};
/**
 * Get the part of an image URI that follows its final "/".
 *
 * @param {String} imageURI A remote image URI
 * @return {String} Text after the last slash (the whole string if there is none)
 */
const imgFilename = ( imageURI ) => {
	const lastSlash = imageURI.lastIndexOf( '/' );
	return imageURI.slice( lastSlash + 1 );
};
/**
 * DOM query selectors used when scraping a handbook page.
 */
const selectors = {
	// Container searched for the table-of-contents links.
	menu: '.menu-table-of-contents-container',
	// Element whose text is used as the page title.
	title: '.entry-header',
	// Element holding the page body HTML and its images.
	content: '.entry-content',
};
/**
 * Remove the handbook root with rimraf, then create it again with mkdirp.
 *
 * @return Promise Resolves with no value once the directory has been
 *                 re-created; rejects with the error from either step.
 */
const emptyHandbookRoot = () => new Promise( ( resolve, reject ) => {
	// Runs after the removal attempt: bail on failure, otherwise re-create.
	const recreateRoot = ( removeError ) => {
		if ( removeError ) {
			reject( removeError );
			return;
		}
		mkdirp( handbookRoot, ( createError ) => {
			if ( createError ) {
				reject( createError );
				return;
			}
			resolve();
		} );
	};

	rimraf( handbookRoot, recreateRoot );
} );
/**
 * Make sure a directory exists by passing its path to mkdirp.
 *
 * @param {String} path An absolute file system path
 * @return Promise Resolves with no value on success; rejects with the
 *                 mkdirp error otherwise.
 */
const ensureExists = ( path ) => new Promise( ( resolve, reject ) => {
	log( `Ensuring path ${ path }` );
	mkdirp( path, ( err ) => {
		if ( ! err ) {
			resolve();
			return;
		}
		reject( err );
	} );
} );
/**
 * Save a string to a file as UTF-8 and report the write on the console.
 *
 * @param {String} filePath An absolute filesystem path to the output file
 * @param {String} content  The file content to write
 * @return Promise Resolves with the content that was written; rejects with
 *                 the fs.writeFile error.
 */
const write = ( filePath, content ) => new Promise( ( resolve, reject ) => {
	const onWritten = ( err ) => {
		if ( ! err ) {
			console.log( `Wrote ${ filePath } to disk` );
			resolve( content );
			return;
		}
		reject( err );
	};

	fs.writeFile( filePath, content, { encoding: 'utf8' }, onWritten );
} );
/**
 * Download a file to disk.
 *
 * The promise settles only after the local write stream has finished, so a
 * resolved promise means the response body has been handed to the file
 * system. Errors from the request, the response stream, or the write stream
 * all reject the promise.
 *
 * @param {String} filePath An absolute path to which to save the file.
 * @param {String} fileURI  A remote file URI (fetched with the https module).
 * @return Promise Resolves with no value once the file has been written.
 */
const download = ( filePath, fileURI ) => new Promise( ( resolve, reject ) => {
	log( `Downloading ${ fileURI }\n to ${ filePath }` );
	https
		.get( fileURI, ( response ) => {
			// Open the output file only once a response arrives, so a failed
			// request does not leave an empty file or an open stream behind.
			const writeToDisk = fs.createWriteStream( filePath );
			// 'finish' on the write stream (not on the request object) is the
			// signal that all data has been flushed.
			writeToDisk.on( 'finish', () => {
				console.log( `Saved ${ filePath } to disk` );
				resolve();
			} );
			writeToDisk.on( 'error', reject );
			response.on( 'error', ( err ) => {
				// pipe() does not close the destination when the source fails.
				writeToDisk.destroy();
				reject( err );
			} );
			response.pipe( writeToDisk );
		} )
		.on( 'error', reject );
} );
/**
 * Save markdown content as a file inside a directory, creating the directory
 * first when needed.
 *
 * @param {String} path     An absolute filesystem path (the target directory)
 * @param {String} filename A string filename
 * @param {String} content  The markdown content to write
 * @return Promise Settles with the result of write() for the joined path
 */
async function saveMarkdown( path, filename, content ) {
	const charCount = content.length;
	const outputFile = join( path, filename );
	log( `Saving ${ charCount } chars to ${ outputFile }` );
	await ensureExists( path );
	return write( outputFile, content );
}
/**
 * Fetch a remote image and store it in a subdirectory of the handbook root,
 * named after the last path segment of its URI.
 *
 * @param {String} subpath  A relative directory within the handbook
 * @param {String} imageURI A remote image URI
 * @return Promise Settles with the result of download() for that image
 */
async function saveImage( subpath, imageURI ) {
	log( `Attempting to download ${ imageURI }` );
	const targetDir = join( handbookRoot, subpath );
	// Same naming rule as the one used when markdown image links are rewritten.
	const targetFile = join( targetDir, imgFilename( imageURI ) );
	await ensureExists( targetDir );
	return download( targetFile, imageURI );
}
/**
 * Replace every occurrence of a literal substring within a string.
 *
 * The text is scanned once, so a replacement that itself contains the search
 * string (e.g. "pic.png" -> "./pic.png") is inserted verbatim and not
 * processed again. The replacement is also treated literally: sequences such
 * as "$&" have no special meaning.
 *
 * @param {String} str         The text to search
 * @param {String} substr      The literal text to look for
 * @param {String} replacement The text to insert in place of each match
 * @return {String} The text with all occurrences replaced
 */
const replace = ( str, substr, replacement ) => {
	// An empty search string would match between every character; leave the
	// text untouched instead.
	if ( substr === '' ) {
		return str;
	}
	return str.split( substr ).join( replacement );
};
/**
 * Download a Gutenberg handbook page as markdown and save any images.
 *
 * The page is scraped for its menu links, title, content HTML and image
 * sources. Images are downloaded one at a time into the page's output
 * directory, image and intra-handbook links in the markdown are rewritten,
 * and the result is saved with a heading and Jekyll front matter.
 *
 * @param {String} uri A handbook URI.
 * @return Promise Resolves with the array of menu link URLs found on the page.
 */
async function downloadPage( uri ) {
	log( `Downloading handbook page ${ uri }` );

	// Collect one trimmed attribute from each element, skipping elements on
	// which the attribute is missing or blank (calling .trim() on a missing
	// attribute would throw).
	const attrValues = ( $, $elements, attribute ) => $elements
		.map( ( i, el ) => ( $( el ).attr( attribute ) || '' ).trim() )
		.get()
		.filter( Boolean );

	const data = await spider( uri, {
		links: ( $ ) => attrValues( $, $( selectors.menu ).find( 'a' ), 'href' ),
		title: ( $ ) => $( selectors.title )
			.text()
			.trim(),
		// Fall back to an empty string when the content node yields no HTML.
		content: ( $ ) => ( $( selectors.content ).html() || '' ).trim(),
		images: ( $ ) => attrValues( $, $( selectors.content ).find( 'img' ), 'src' ),
	} );

	const subpath = getSubpath( uri );

	// Special handling for the root handbook index page.
	const filename = HANDBOOK_URI === uri ?
		'index.md' :
		getFilename( uri );

	const outputPath = subpath ?
		join( handbookRoot, subpath ) :
		handbookRoot;

	if ( data.images.length ) {
		console.log( ` Found ${ data.images.length } image${ data.images.length > 1 ? 's' : '' }` );
	}

	// Download images sequentially, one request at a time.
	for ( const imageURI of data.images ) {
		await saveImage( subpath, imageURI );
	}

	// Relative image links
	let markdown = data.images.reduce( ( text, imageURI ) => (
		replace( text, imageURI, `./${ imgFilename( imageURI ) }` )
	), turndown( data.content ) );

	// Relative intra-handbook links
	markdown = data.links.reduce( ( text, linkURI ) => (
		replace( text, linkURI, linkURI.replace( /^.*gutenberg\/handbook/, '' ) )
	), markdown );

	// Heading
	markdown = `# ${ data.title }\n\n${ markdown }`;

	// Jekyll front matter
	markdown = `---\ntitle: ${ data.title }\n---\n\n${ markdown }`;

	await saveMarkdown( outputPath, filename, markdown );

	return data.links;
}
/**
 * Pause for a while.
 *
 * @param {Number} delay Time to wait, passed to setTimeout as its delay
 * @return Promise Resolves with no value once the timer fires
 */
function wait( delay ) {
	return new Promise( ( resolve ) => {
		setTimeout( resolve, delay );
	} );
}
// Async IIFE to kick off the script: empty the output directory, download the
// handbook index, then download each page linked from its menu one at a time.
( async function main() {
	await emptyHandbookRoot();

	const links = await downloadPage( HANDBOOK_URI );

	// The index page has already been saved above.
	const handbookPages = links.filter( ( link ) => link !== HANDBOOK_URI );

	for ( const uri of handbookPages ) {
		// Short pause between page requests.
		await wait( 200 );
		await downloadPage( uri );
	}

	console.log( '\nDownload complete!' );
}() ).catch( ( err ) => {
	// Report the failure and signal it through the exit code instead of
	// leaving an unhandled promise rejection.
	console.error( err );
	process.exitCode = 1;
} );
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment