Created
July 23, 2014 12:11
-
-
Save missinglink/72bc85c57c34e53e0c09 to your computer and use it in GitHub Desktop.
Crawl the Elasticsearch documentation for curl syntax errors
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** Crawl documentation for curl syntax errors **/

// Requires nodejs and npm
// @see: https://github.com/isaacs/nave for easy installation
// @install nodejs: [sudo] bash nave.sh usemain stable
// @ref: https://github.com/missinglink/huntsman

// Usage:
// $> npm install huntsman
// $> node esdocs.js

'use strict';

var huntsman = require('huntsman');
var spider = huntsman.spider();

// only pages under the guide are crawled and inspected
var guideRegex = /https?:\/\/www\.elasticsearch\.org\/guide\/en.*/i;

// a line that contains a curl command with an explicit -X method
var CURL_LINE = /curl\s*-X/;

// methods accepted without inspecting a request body
var CURL_WITHOUT_BODY = /curl\s*-X(GET|DELETE)/;

// methods that must be followed by an optionally quoted url and a -d payload
var CURL_WITH_BODY = /curl\s*-X(PUT|POST)\s*'?[^\s]*'?\s*-d\s*/;

// same as CURL_WITH_BODY, additionally swallowing a leading '$' shell prompt;
// used to strip everything in front of the payload
var CURL_BODY_PREFIX = /\$?\s*curl\s*-X(PUT|POST)\s*'?[^\s]*'?\s*-d\s*/;

/**
 * Validate the curl command found in a single documentation code listing.
 *
 * Only the LAST line containing `curl -X` is inspected; earlier commands in
 * the same listing are not checked. For PUT/POST the text following `-d`
 * (from that line to the end of the listing) must be a single-quoted JSON
 * document terminated by a semi-colon.
 *
 * @param {string} script - text content of one code listing
 * @returns {Array<{level: string, args: Array, context: (string|undefined)}>}
 *   problems found, in the order detected; empty when the listing contains
 *   no curl command or the command looks valid. `level` is 'warn' or 'error',
 *   `args` are the values to print, `context` is optional text to print on
 *   a following line.
 */
function checkCurlExample( script ){
  var problems = [];
  var lines = script.split('\n');

  // remember the last line which holds a curl command (-1: none found)
  var curlLine = -1;
  lines.forEach( function( line, lineNumber ){
    if( CURL_LINE.test( line ) ) curlLine = lineNumber;
  });
  if( curlLine === -1 ) return problems;

  var command = lines[ curlLine ];

  // GET/DELETE: nothing further to verify
  if( CURL_WITHOUT_BODY.test( command ) ) return problems;

  if( !CURL_WITH_BODY.test( command ) ){
    problems.push({
      level: 'error',
      args: [ 'invalid/unknown curl format' ],
      context: script
    });
    return problems;
  }

  // Extract the payload starting at the validated curl line, so text in
  // front of the command (comments, earlier commands) cannot leak into it.
  var payload = lines.slice( curlLine ).join('\n').replace( CURL_BODY_PREFIX, '' ).trim();

  if( payload.charAt( payload.length - 1 ) === ';' ){
    payload = payload.slice( 0, -1 );
  }
  else {
    problems.push({ level: 'warn', args: [ 'missing semi-colon' ] });
  }

  var isSingleQuoted = payload.length >= 2 &&
    payload.charAt( 0 ) === "'" &&
    payload.charAt( payload.length - 1 ) === "'";

  if( !isSingleQuoted ){
    // Without the surrounding quotes the payload boundaries are unknown;
    // parsing anyway would only add a second, misleading 'invalid json' error.
    problems.push({
      level: 'error',
      args: [ 'invalid json quoting' ],
      context: payload
    });
    return problems;
  }

  var jsonText = payload.slice( 1, -1 );
  try { JSON.parse( jsonText ); }
  catch( e ){
    problems.push({
      level: 'error',
      args: [ 'invalid json', e ],
      context: jsonText
    });
  }

  return problems;
}

spider.extensions = [

  // load recurse extension & follow anchor links
  huntsman.extension( 'recurse', {
    pattern: {
      filter: guideRegex // only crawl guide pages
    },
    // don't normalise url (i.e. keep the trailing slash); because apache
    normaliser: function ( uri ) { return uri; }
  }),

  huntsman.extension( 'cheerio' ), // load cheerio extension
  huntsman.extension( 'stats' ) // load stats extension
];

// target only guide pages
spider.on( guideRegex, function ( err, res ){

  // a failed request cannot be inspected; report it instead of crashing
  if( err ){
    console.error( 'request failed', err );
    return;
  }
  if( !res || !res.extension || !res.extension.cheerio ) return; // content is not html

  var $ = res.extension.cheerio;

  // NOTE(review): res.uri is assumed to hold the page url, as in the
  // huntsman README examples - confirm against the installed version.
  var location = res.uri + ':';

  // check for curl example errors
  $('pre.programlisting').each( function( i, el ){
    checkCurlExample( $(el).text() ).forEach( function( problem ){
      var log = ( problem.level === 'warn' ) ? console.warn : console.error;
      log.apply( console, [ location ].concat( problem.args ) );
      if( problem.context !== undefined ) log.call( console, problem.context );
    });
  });
});

spider.queue.add( 'http://www.elasticsearch.org/guide/en/elasticsearch' );
spider.start();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment