Last active
November 29, 2015 21:27
-
-
Save KittyGiraudel/e4ab93f034171999cb5b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Here is a very simple Node.js script that performs HTTP GET requests to a
// collection of similar pages (see `URLS`) in order to extract some information
// from the (virtual) DOM. For instance, you could parse an array of pages and
// grab the title of each of them.
//
// The script then writes the resulting data (an array as well, of course) to a
// JSON file (see `DEST_FILE`). It also caches the requested HTML documents
// in a cache file (see `CACHE_FILE`) in order to avoid having to perform all
// the HTTP requests when only the collected data changes (see `parseData`).
import fs from 'fs-promise'; | |
import cheerio from 'cheerio'; | |
import request from 'request-promise'; | |
// File in which the raw HTML of every requested page is cached (as a JSON
// array of strings).
const CACHE_FILE = 'data.cache';

// File in which the collected data is written (pretty-printed JSON).
const DEST_FILE = 'data.json';

// Pages to request; one entry of collected data is produced per URL.
const URLS = [
  'http://example.com/foo',
  'http://example.com/bar'
];
// Describes the data to collect from a single document.
//
// `$` is the jQuery-like selector function for that document (the script passes
// the result of `cheerio.load`). Each key of the returned object becomes a key
// of the matching entry in the resulting JSON data file; each value is the
// result of a jQuery-like expression querying the DOM:
// - `title` is the text content of `h1.page-title`;
// - `description` is the inner HTML of the first `<p>` child of `.main`.
const parseData = $ => ({
  title: $('h1.page-title').text(),
  description: $('.main > p:first-of-type').html()
});
// Read the cached documents from `CACHE_FILE` and parse them as JSON. If the
// cache cannot be used (missing, unreadable, or not a JSON array), perform all
// the requests and store the HTML of each document in the cache file.
// Then, iterate over the documents, load a DOM crawler (Cheerio), collect the
// data and finally write a JSON file containing the expected data. Just before
// exiting, log something. Any failure is reported on stderr and turns the exit
// code into a non-zero value.
fs.readFile(CACHE_FILE, { encoding: 'utf8' })
  .then(content => {
    const cachedDocs = JSON.parse(content);

    // The rest of the pipeline maps over the documents, so anything other than
    // an array is treated as an unusable cache rather than crashing later.
    if (!Array.isArray(cachedDocs)) {
      throw new TypeError('cache content is not a JSON array');
    }

    return cachedDocs;
  })
  .catch(error => {
    // Only a missing file means "first run"; report other failures (bad JSON,
    // permissions, ...) for what they are instead of hiding them.
    if (error && error.code === 'ENOENT') {
      console.log(`Cache file ${CACHE_FILE} does not exist yet. Creating it.`);
    } else {
      const reason = error && error.message ? error.message : error;
      console.log(`Cache file ${CACHE_FILE} could not be used (${reason}). Recreating it.`);
    }

    // `Array#map` also passes the index and the array to its callback; forward
    // only the URL so they are not handed to `request` as extra arguments.
    return Promise.all(URLS.map(url => request(url)))
      .then(docs =>
        // Wait for the cache to be written so the promise is not left
        // floating. The cache is only an optimisation: if writing it fails,
        // report the problem and carry on with the fetched documents.
        fs.writeFile(CACHE_FILE, JSON.stringify(docs))
          .catch(writeError => {
            console.error(`Could not write cache file ${CACHE_FILE}: ${writeError.message}`);
          })
          .then(() => docs)
      );
  })
  .then(docs => docs.map(doc => parseData(cheerio.load(doc))))
  .then(data => fs.writeFile(DEST_FILE, JSON.stringify(data, null, 2)))
  .then(() => console.log(`File ${DEST_FILE} correctly created.`))
  .catch(error => {
    // Errors belong on stderr, and the exit code must reflect the failure.
    console.error(error);
    process.exitCode = 1;
  });
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment