Skip to content

Instantly share code, notes, and snippets.

@missinglink
Created July 23, 2014 12:11
Show Gist options
  • Select an option

  • Save missinglink/72bc85c57c34e53e0c09 to your computer and use it in GitHub Desktop.

Select an option

Save missinglink/72bc85c57c34e53e0c09 to your computer and use it in GitHub Desktop.
Crawl es documentation for curl syntax errors
/** Crawl documentation for curl syntax errors **/
// Requires nodejs and npm
// @see: https://github.com/isaacs/nave for easy installation
// @install nodejs: [sudo] bash nave.sh usemain stable
// @ref: https://github.com/missinglink/huntsman
// Usage:
// $> npm install huntsman
// $> node esdocs.js
// crawler library and the spider instance that the rest of the script configures
var huntsman = require('huntsman');
var spider = huntsman.spider();

// matches http(s) uris containing www.elasticsearch.org/guide/en (case-insensitive)
var guideRegex = /https?:\/\/www\.elasticsearch\.org\/guide\/en.*/i;

// recurse extension: follow anchor links, but only those pointing at guide pages
var recurseExtension = huntsman.extension( 'recurse', {
  pattern: { filter: guideRegex },
  // hand every uri back untouched, so no normalisation (such as removing the
  // trailing slash) is applied; the original author's stated reason: "because apache"
  normaliser: function ( uri ) {
    return uri;
  }
});

// cheerio extension: its result is read from res.extension.cheerio by the page handler
var cheerioExtension = huntsman.extension( 'cheerio' );

// stats extension
var statsExtension = huntsman.extension( 'stats' );

spider.extensions = [ recurseExtension, cheerioExtension, statsExtension ];
/**
 * Page handler, registered for guide pages only (uris matching guideRegex).
 *
 * Every `pre.programlisting` element on the page is treated as a potential
 * curl example and checked; problems are written to the console and nothing
 * is returned.
 *
 * @param {*} err - error passed by the spider, if any; it is logged
 * @param {Object} res - response object; only `res.extension.cheerio` is read
 */
spider.on( guideRegex, function ( err, res ){

  // surface crawl errors instead of silently ignoring them
  if( err ) console.error( 'crawl error', err );

  // nothing to inspect: no response at all, or content is not html
  if( !res || !res.extension || !res.extension.cheerio ) return;
  var $ = res.extension.cheerio;

  // check for curl example errors
  $('pre.programlisting').each( function( index, el ){
    checkCurlExample( $(el).text() );
  });
});

/**
 * Check a single code listing for curl syntax problems.
 *
 * The last line containing `curl -X` decides how the listing is treated:
 *   - `-XGET` / `-XDELETE`: accepted without further checks;
 *   - `-XPUT` / `-XPOST` followed by a uri and `-d`: the body is validated;
 *   - anything else: reported as an invalid/unknown curl format.
 * Listings without any `curl -X` line are ignored.
 *
 * @param {string} script - full text of the code listing
 */
function checkCurlExample( script ){

  var lines = script.split('\n');

  // index of the last line invoking curl with an explicit method, -1 if none
  var curlLine = -1;
  lines.forEach( function( line, lineNumber ){
    if( /curl\s*-X/.test( line ) ) curlLine = lineNumber;
  });
  if( curlLine === -1 ) return; // not a curl example

  var command = lines[ curlLine ];

  // GET/DELETE examples are considered valid as they are
  if( /curl\s*-X(GET|DELETE)/.test( command ) ) return;

  if( /curl\s*-X(PUT|POST)\s*'?[^\s]*'?\s*-d\s*/.test( command ) ){
    // validate the body of the command that was actually matched: start at
    // the curl line so preceding lines do not leak into the json text
    checkCurlBody( lines.slice( curlLine ).join('\n') );
    return;
  }

  console.error( 'invalid/unknown curl format' );
  console.error( script );
}

/**
 * Validate the body passed to `-d` in a PUT/POST curl example.
 *
 * Expects the body to be single-quoted json, optionally followed by a
 * semi-colon. A missing semi-colon is only a warning; bad quoting and
 * unparseable json are reported as errors.
 *
 * @param {string} example - listing text starting at the curl line
 */
function checkCurlBody( example ){

  // drop everything up to and including the `-d` flag
  var jsonText = example.replace( /\$?\s*curl\s*-X(PUT|POST)\s*'?[^\s]*'?\s*-d\s*/, '' ).trim();

  if( jsonText.slice( -1 ) === ';' ){
    jsonText = jsonText.slice( 0, -1 );
  }
  else {
    console.warn( 'missing semi-colon' );
  }

  // the body must be wrapped in single quotes; when it is not, stop here:
  // chopping off the first and last character regardless would only yield a
  // second, misleading 'invalid json' report
  if( jsonText.length < 2 || jsonText.charAt( 0 ) !== "'" || jsonText.slice( -1 ) !== "'" ){
    console.error( 'invalid json quoting' );
    console.error( example );
    return;
  }

  // strip the surrounding quotes and make sure what is left parses
  jsonText = jsonText.slice( 1, -1 );
  try { JSON.parse( jsonText ); }
  catch( e ){
    console.error( 'invalid json', e );
    console.error( jsonText );
  }
}
// add the entry page of the guide to the spider's queue, then start the spider
var seedUri = 'http://www.elasticsearch.org/guide/en/elasticsearch';
spider.queue.add( seedUri );
spider.start();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment