Created
May 5, 2018 19:18
-
-
Save BrunoWinck/3a2f7ba1708b8b46e830a1d4bd7c31b9 to your computer and use it in GitHub Desktop.
Node.js script derived from CogDog's storify-extractor (https://github.com/cogdog/storify-extractor); no external dependencies.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Derived from https://github.com/cogdog/storify-extractor | |
| // Same License | |
| // Usage: node recup-storify.js <url of story> | |
| // will save a file <story name>.html | |
/**
 * Collect every match of `regex` in `text`.
 *
 * h/t https://stackoverflow.com/a/29999424/2418186
 *
 * @param {RegExp} regex - Pattern to apply. With the `g` flag all matches are
 *   returned; without it at most one match is returned.
 * @param {string} text - Text to search.
 * @returns {RegExpExecArray[]} The `exec` result arrays, in order of
 *   appearance (empty when nothing matches).
 * @throws {Error} 'not RegExp' when `regex` is not a regular expression.
 */
function getAllMatches(regex, text) {
  if (!(regex instanceof RegExp)) {
    throw new Error('not RegExp');
  }

  const matches = [];

  if (!regex.global) {
    const single = regex.exec(text);
    if (single !== null) {
      matches.push(single);
    }
    return matches;
  }

  // A global regex remembers where its previous search stopped; start from
  // the beginning so a reused regex does not silently skip leading matches.
  regex.lastIndex = 0;

  let match = regex.exec(text);
  while (match !== null) {
    matches.push(match);
    if (match[0] === '') {
      // exec() does not move lastIndex past a zero-length match, so without
      // this step the loop would find the same empty match forever. In
      // unicode mode, step over a whole code point rather than half a pair.
      const isAstral = regex.unicode && String(text).codePointAt(regex.lastIndex) > 0xffff;
      regex.lastIndex += isAstral ? 2 : 1;
    }
    match = regex.exec(text);
  }

  return matches;
}
| let http = require('https'); | |
| let fs = require( 'fs'); | |
| // console.log( "getting for ", process.argv[ 2]); | |
| // our raw input. yum. | |
| // var raw = fs.readFileSync("/dev/stdin", "utf-8"); | |
/**
 * Escape regular-expression metacharacters so `text` is matched literally
 * when embedded in a dynamically built RegExp.
 *
 * @param {string} text - Literal text.
 * @returns {string} `text` with every metacharacter backslash-escaped.
 */
function escapeRegExp(text) {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Extract the story content from the HTML of a story page.
 *
 * Every `data-permalink="…"` attribute is visited in document order:
 *  - a permalink that does not mention storify.com contributes its URL;
 *  - a permalink that mentions storify.com (other than the bare
 *    https://storify.com/ one, which is skipped) contributes the text found
 *    after `emojify">` on the same list-item line.
 *
 * @param {string} html - Raw page HTML.
 * @returns {string[]} URLs and text snippets in document order.
 */
function extractStoryItems(html) {
  // Make sure any permalink that starts with "//" becomes "http://".
  let page = html.replace(/data-permalink="\/\//g, 'data-permalink="http://');
  // Put each list item on its own line: `.` does not cross newlines, so the
  // text regex below stays confined to the item it starts in.
  page = page.replace(/<\/div><\/div><\/li>/g, '</div></div></li>\n');

  // The URL stops at whitespace or at the closing quote, so an attribute
  // that follows without a space is never swallowed into the URL.
  const permalinkRegex = /data-permalink="(https?:\/\/[^\s"]+)"/g;

  const items = [];
  getAllMatches(permalinkRegex, page).forEach((match) => {
    const attribute = match[0];

    if (!attribute.includes('storify.com')) {
      // External link: keep the URL itself.
      items.push(match[1]);
      return;
    }

    if (attribute === 'data-permalink="https://storify.com/"') {
      // Points only at the storify home page: junk, skip it.
      return;
    }

    // Storify text element: locate the line that starts with this attribute
    // and capture the text inside the emojify div. The attribute is escaped
    // because URLs routinely contain regex metacharacters (".", "?", "+").
    const textRegex = new RegExp(
      `${escapeRegExp(attribute)}(.*)emojify">(.*)</div></div></li>`,
      'i'
    );
    const textMatch = page.match(textRegex);
    if (textMatch === null) {
      // Skip this one element instead of aborting the whole extraction.
      console.error(`No text content found for ${match[1]}, skipping.`);
      return;
    }
    items.push(textMatch[2]);
  });

  return items;
}

// The story URL is the first command-line argument; the page itself is
// requested at "<url>.html".
const storyUrl = process.argv[2];

if (!storyUrl) {
  console.error('Usage: node recup-storify.js <url of story>');
  process.exitCode = 1;
} else {
  const source = `${storyUrl}.html`;

  try {
    http.get(source, (response) => {
      const { statusCode } = response;
      const contentType = response.headers['content-type'];

      let error;
      if (statusCode !== 200) {
        error = new Error(`Request Failed.\nStatus Code: ${statusCode}`);
      } else if (!/^text\/html/.test(contentType)) {
        error = new Error(
          `Invalid content-type.\nExpected text/html but received ${contentType}`
        );
      }
      if (error) {
        console.error(error.message);
        // Consume response data to free up memory.
        response.resume();
        return;
      }

      response.setEncoding('utf8');
      let rawData = '';
      response.on('data', (chunk) => { rawData += chunk; });
      response.on('end', () => {
        try {
          const items = extractStoryItems(rawData);
          // Output file: last path segment of the requested URL, without
          // any query string or fragment.
          const outname = source.split('/').pop().split('?')[0].split('#')[0];
          // Put double line breaks between the results.
          fs.writeFileSync(outname, items.join('\n\n'), 'utf-8');
        } catch (e) {
          console.error(e.message);
        }
      });
    }).on('error', (e) => {
      console.error(`Got error: ${e.message}`);
    });
  } catch (e) {
    // The request call itself throws synchronously when the URL is rejected
    // (e.g. it cannot be parsed); report it like any other request error.
    console.error(`Got error: ${e.message}`);
    process.exitCode = 1;
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment