Skip to content

Instantly share code, notes, and snippets.

@BrunoWinck
Created May 5, 2018 19:18
Show Gist options
  • Select an option

  • Save BrunoWinck/3a2f7ba1708b8b46e830a1d4bd7c31b9 to your computer and use it in GitHub Desktop.

Select an option

Save BrunoWinck/3a2f7ba1708b8b46e830a1d4bd7c31b9 to your computer and use it in GitHub Desktop.
NodeJS Script derived from Cogdog storify extractor https://github.com/cogdog/storify-extractor, no dependency
// Derived from https://github.com/cogdog/storify-extractor
// Same License
// Usage: node recup-storify.js <url of story>
// will save a file <story name>.html
/**
 * Collect every match of `regex` in `text`.
 *
 * h/t https://stackoverflow.com/a/29999424/2418186
 *
 * @param {RegExp} regex - Pattern to run. With the `g` flag every match is
 *   collected; without it at most one match is returned.
 * @param {string} text - Text to search.
 * @returns {Array} Match arrays exactly as produced by `regex.exec(text)`
 *   (full match at [0], capture groups after it, plus `index`).
 * @throws {Error} 'not RegExp' when `regex` is not a regular expression.
 */
function getAllMatches(regex, text) {
  // instanceof also rejects null/undefined with the intended error instead of
  // a TypeError from reading `.constructor`.
  if (!(regex instanceof RegExp)) {
    throw new Error('not RegExp');
  }

  const matches = [];

  if (!regex.global) {
    // Without the global flag exec() would return the same first match on
    // every call, so run it only once.
    const single = regex.exec(text);
    if (single) {
      matches.push(single);
    }
    return matches;
  }

  let match;
  while ((match = regex.exec(text)) !== null) {
    matches.push(match);
    // An empty match leaves lastIndex where it was; step past it by hand,
    // otherwise exec() would return the same empty match forever.
    if (match[0] === '') {
      regex.lastIndex += 1;
    }
  }
  return matches;
}
let http = require('https');
let fs = require( 'fs');
// Main script: download "<story url>.html", pull out every embedded link and
// every Storify text element, and save them (separated by blank lines) to a
// local file named after the last path segment of the requested URL.

/**
 * Escape regular-expression metacharacters so `text` is matched literally
 * when embedded in a `new RegExp(...)` pattern.
 *
 * @param {string} text - Literal text to embed.
 * @returns {string} `text` with every metacharacter backslash-escaped.
 */
function escapeRegExp(text) {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Extract the story elements from the raw HTML of a story page.
 *
 * @param {string} html - Raw page markup.
 * @returns {string[]} External permalinks and Storify text fragments, in the
 *   order their `data-permalink` attributes appear in the markup.
 */
function extractStoryItems(html) {
  // make sure we replace any urls that start with "//" with "http://"
  let raw = html.replace(/data-permalink="\/\//g, 'data-permalink="http://');
  // Put each list item on its own line: the per-item pattern below uses '.',
  // which does not cross newlines, so a match stays inside one item.
  raw = raw.replace(/<\/div><\/div><\/li>/g, '</div></div></li>\n');

  // the pattern that finds all storify links worth extracting
  const permalinkPattern = /data-permalink=\"(https?:\/\/[^\s]+)\"/g;
  const items = [];

  getAllMatches(permalinkPattern, raw).forEach((match) => {
    const attribute = match[0];

    // Anything not pointing at storify.com is an external link: keep the URL
    // (capture group 1 is the attribute value without the quotes).
    if (!attribute.includes('storify.com')) {
      items.push(match[1]);
      return;
    }

    // Elements that point only to the storify home page are junk.
    if (attribute === 'data-permalink="https://storify.com/"') {
      return;
    }

    // Storify text content: find the line carrying this attribute and capture
    // the text inside its emojify div. The attribute is escaped because URLs
    // routinely contain regex metacharacters such as '.' and '?'.
    const textPattern = new RegExp(
      escapeRegExp(attribute) + '(.*)emojify">(.*)</div></div></li>',
      'i'
    );
    const found = raw.match(textPattern);
    if (found) {
      items.push(found[2]);
    } else {
      // Skip just this element instead of aborting the whole extraction.
      console.error(`Skipping Storify element without text: ${match[1]}`);
    }
  });

  return items;
}

/**
 * Derive the output file name from the requested URL: its last path segment
 * with any query string and fragment removed.
 *
 * @param {string} url - URL that was requested.
 * @returns {string} File name to write to.
 */
function outputNameFor(url) {
  const lastSegment = url.split('/').pop();
  return lastSegment.split('?')[0].split('#')[0];
}

const storyUrl = process.argv[2];
if (!storyUrl) {
  // Without this guard the request would be made for "undefined.html".
  console.error('Usage: node recup-storify.js <url of story>');
  process.exitCode = 1;
} else {
  // The page is requested at the story URL with ".html" appended.
  const source = storyUrl + '.html';

  http.get(source, (response) => {
    const { statusCode } = response;
    const contentType = response.headers['content-type'];

    let error;
    if (statusCode !== 200) {
      error = new Error('Request Failed.\n' +
        `Status Code: ${statusCode}`);
    } else if (!/^text\/html/.test(contentType)) {
      error = new Error('Invalid content-type.\n' +
        `Expected text/html but received ${contentType}`);
    }
    if (error) {
      console.error(error.message);
      // consume response data to free up memory
      response.resume();
      return;
    }

    response.setEncoding('utf8');
    let rawData = '';
    response.on('data', (chunk) => { rawData += chunk; });
    response.on('end', () => {
      try {
        const items = extractStoryItems(rawData);
        // insert the results, put double line breaks between them
        fs.writeFileSync(outputNameFor(source), items.join('\n\n'), 'utf-8');
      } catch (e) {
        console.error(e.message);
      }
    });
  }).on('error', (e) => {
    console.error(`Got error: ${e.message}`);
  });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment