BrunoWinck · May 5, 2018 19:18
diff --git a/recup-storify.js b/recup-storify.js
 // Derived from https://github.com/cogdog/storify-extractor
 // Same License
 // Usage: node recup-storify.js <url of story>
 // will save a file <story name>.html

 /**
  * Collect the matches of `regex` in `text` as an array of `exec` results.
  *
  * h/t https://stackoverflow.com/a/29999424/2418186
  *
  * With the global (`g`) flag every match is returned; without it only the
  * first match (if any) is returned. Each entry is the array produced by
  * `RegExp.prototype.exec`, so capture groups and `.index` are available.
  *
  * @param {RegExp} regex - pattern to apply; its `lastIndex` is advanced by
  *   `exec` while scanning, exactly as with a hand-written exec loop.
  * @param {string} text - text to search.
  * @returns {RegExpExecArray[]} matches in order of appearance (possibly empty).
  * @throws {Error} 'not RegExp' when `regex` is not a regular expression.
  */
 function getAllMatches(regex, text) {
     if (!(regex instanceof RegExp)) {
         throw new Error('not RegExp');
     }

     const res = [];

     // Without the global flag exec() always restarts, so looping would never
     // terminate; take the single match and stop.
     if (!regex.global) {
         const only = regex.exec(text);
         if (only !== null) {
             res.push(only);
         }
         return res;
     }

     let match;
     while ((match = regex.exec(text)) !== null) {
         res.push(match);
         // A zero-length match leaves lastIndex where it was; step over it
         // manually, otherwise exec() returns the same empty match forever.
         if (match[0] === '') {
             regex.lastIndex += 1;
         }
     }
     return res;
 }

 let http = require('https');
 let fs = require('fs');

 // Without a story URL the request below would be made against the literal
 // string "undefined.html"; fail early with the usage line instead.
 if (!process.argv[2]) {
     console.error('Usage: node recup-storify.js <url of story>');
     process.exit(1);
 }

 // The page that is fetched is "<story url>.html".
 let source = process.argv[2] + ".html";

 // `raw` holds the ClientRequest returned by http.get(); the page body itself
 // is only available inside the response callbacks below.
 var raw = http.get(source, (res) => {
     const { statusCode } = res;
     const contentType = res.headers['content-type'];

     let error;
     if (statusCode !== 200) {
         error = new Error('Request Failed.\n' +
                           `Status Code: ${statusCode}`);
     } else if (!/^text\/html/.test(contentType)) {
         error = new Error('Invalid content-type.\n' +
                           `Expected text/html but received ${contentType}`);
     }
     if (error) {
         console.error(error.message);
         // consume response data to free up memory
         res.resume();
         return;
     }

     res.setEncoding('utf8');
     let rawData = '';
     res.on('data', (chunk) => { rawData += chunk; });
     res.on('end', () => {
         try {
             // Permalinks are spliced into a dynamically built pattern below;
             // escape them so characters such as "?", "." or "+" match literally.
             const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

             const html = rawData
                 // make sure we replace any urls that start with "//" with "http://"
                 .replace(/data-permalink="\/\//g, 'data-permalink="http://')
                 // add a new line after each list item so one item sits on one
                 // line ("." in the text pattern below does not cross lines)
                 .replace(/<\/div><\/div><\/li>/g, '</div></div></li>\n');

             // the pattern that finds all storify links worth extracting;
             // the quote is excluded so a match cannot run past the attribute
             const permalinkRegex = /data-permalink="(https?:\/\/[^\s"]+)"/g;

             // holder for results, in page order
             const urls = [];

             getAllMatches(permalinkRegex, html).forEach((item) => {
                 if (!item[0].includes('storify.com')) {
                     // external link: keep the captured URL
                     urls.push(item[1]);
                 } else if (item[0] !== 'data-permalink="https://storify.com/"') {
                     // storify text content (bare links to storify.com are junk
                     // and skipped): capture the text of the div on this line
                     const storifyRegex = new RegExp(
                         escapeRegExp(item[0]) + '(.*)emojify">(.*)</div></div></li>', 'i');
                     const storifyMatch = html.match(storifyRegex);
                     if (storifyMatch) {
                         // the text is the second capture group
                         urls.push(storifyMatch[2]);
                     } else {
                         // one unparsable element must not abort the whole export
                         console.error(`No text found for ${item[1]}`);
                     }
                 }
             });

             // output file name: last path segment of the source URL, without
             // query string or fragment
             const outname = source.split('/').pop().split('?')[0].split('#')[0];

             // write the results, separated by blank lines
             fs.writeFileSync(outname, urls.join("\n\n"), "utf-8");
         } catch (e) {
             console.error(e.message);
         }
     });
 }).on('error', (e) => {
     console.error(`Got error: ${e.message}`);
 });
	// Derived from https://github.com/cogdog/storify-extractor
	// Same License
	// Usage: node recup-storify.js <url of story>
	// will save a file <story name>.html

	/**
	 * Collect the matches of `regex` in `text` as an array of `exec` results.
	 *
	 * h/t https://stackoverflow.com/a/29999424/2418186
	 *
	 * With the global (`g`) flag every match is returned; without it only the
	 * first match (if any) is returned. Each entry is the array produced by
	 * `RegExp.prototype.exec`, so capture groups and `.index` are available.
	 *
	 * @param {RegExp} regex - pattern to apply; its `lastIndex` is advanced by
	 *   `exec` while scanning, exactly as with a hand-written exec loop.
	 * @param {string} text - text to search.
	 * @returns {RegExpExecArray[]} matches in order of appearance (possibly empty).
	 * @throws {Error} 'not RegExp' when `regex` is not a regular expression.
	 */
	function getAllMatches(regex, text) {
		if (!(regex instanceof RegExp)) {
			throw new Error('not RegExp');
		}

		const res = [];

		// Without the global flag exec() always restarts, so looping would never
		// terminate; take the single match and stop.
		if (!regex.global) {
			const only = regex.exec(text);
			if (only !== null) {
				res.push(only);
			}
			return res;
		}

		let match;
		while ((match = regex.exec(text)) !== null) {
			res.push(match);
			// A zero-length match leaves lastIndex where it was; step over it
			// manually, otherwise exec() returns the same empty match forever.
			if (match[0] === '') {
				regex.lastIndex += 1;
			}
		}
		return res;
	}

	let http = require('https');
	let fs = require('fs');

	// Without a story URL the request below would be made against the literal
	// string "undefined.html"; fail early with the usage line instead.
	if (!process.argv[2]) {
		console.error('Usage: node recup-storify.js <url of story>');
		process.exit(1);
	}

	// The page that is fetched is "<story url>.html".
	let source = process.argv[2] + ".html";

	// `raw` holds the ClientRequest returned by http.get(); the page body itself
	// is only available inside the response callbacks below.
	var raw = http.get(source, (res) => {
		const { statusCode } = res;
		const contentType = res.headers['content-type'];

		let error;
		if (statusCode !== 200) {
			error = new Error('Request Failed.\n' +
				`Status Code: ${statusCode}`);
		} else if (!/^text\/html/.test(contentType)) {
			error = new Error('Invalid content-type.\n' +
				`Expected text/html but received ${contentType}`);
		}
		if (error) {
			console.error(error.message);
			// consume response data to free up memory
			res.resume();
			return;
		}

		res.setEncoding('utf8');
		let rawData = '';
		res.on('data', (chunk) => { rawData += chunk; });
		res.on('end', () => {
			try {
				// Permalinks are spliced into a dynamically built pattern below;
				// escape them so characters such as "?", "." or "+" match literally.
				const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

				const html = rawData
					// make sure we replace any urls that start with "//" with "http://"
					.replace(/data-permalink="\/\//g, 'data-permalink="http://')
					// add a new line after each list item so one item sits on one
					// line ("." in the text pattern below does not cross lines)
					.replace(/<\/div><\/div><\/li>/g, '</div></div></li>\n');

				// the pattern that finds all storify links worth extracting;
				// the quote is excluded so a match cannot run past the attribute
				const permalinkRegex = /data-permalink="(https?:\/\/[^\s"]+)"/g;

				// holder for results, in page order
				const urls = [];

				getAllMatches(permalinkRegex, html).forEach((item) => {
					if (!item[0].includes('storify.com')) {
						// external link: keep the captured URL
						urls.push(item[1]);
					} else if (item[0] !== 'data-permalink="https://storify.com/"') {
						// storify text content (bare links to storify.com are junk
						// and skipped): capture the text of the div on this line.
						// Both groups need the "*" quantifier; a bare "(.)" would
						// only ever match a single character.
						const storifyRegex = new RegExp(
							escapeRegExp(item[0]) + '(.*)emojify">(.*)</div></div></li>', 'i');
						const storifyMatch = html.match(storifyRegex);
						if (storifyMatch) {
							// the text is the second capture group
							urls.push(storifyMatch[2]);
						} else {
							// one unparsable element must not abort the whole export
							console.error(`No text found for ${item[1]}`);
						}
					}
				});

				// output file name: last path segment of the source URL, without
				// query string or fragment
				const outname = source.split('/').pop().split('?')[0].split('#')[0];

				// write the results, separated by blank lines
				fs.writeFileSync(outname, urls.join("\n\n"), "utf-8");
			} catch (e) {
				console.error(e.message);
			}
		});
	}).on('error', (e) => {
		console.error(`Got error: ${e.message}`);
	});
No results found