Last active
January 23, 2019 19:45
-
-
Save petulla/40e88cf5dcf9cfdcbe253e281dfa925f to your computer and use it in GitHub Desktop.
Generic Promise.all + Axios scraper for fetching HTML content
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const io = require('indian-ocean'); | |
const cheerio = require('cheerio'); | |
const axios = require('axios'); | |
// config
const fileName = 'kirk.csv';
const selector = '.row > div > p';
// url generation: one monthly-archive URL per (year, month) pair
const months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
const years = [2016, 2017, 2018];
// Flatten the year-by-month grid into a flat list of URLs.
// reduce is seeded with [] so an empty `years` array no longer throws
// "Reduce of empty array with no initial value".
const urls = years
  .map(y => months.map(x => `http://www.visualisingdata.com/${y}/${x}/`))
  .reduce((a, v) => a.concat(v), []);
// error reporters: errLog prints only the message, errs the whole error
const errLog = (err) => console.log(err.message);
const errs = (err) => console.log(err);
// Parse a page's rendered HTML and collect the text of every element
// matching the configured selector, as an array of { text } records.
const gatherResults = async (res) => {
  const $ = cheerio.load(res);
  return $(selector)
    .map((idx, node) => ({ text: $(node).text() }))
    .get();
};
// Fetch one archive page and scrape its paragraphs.
// Resolves to [] (never undefined) on a non-200 status or a request
// failure, so a single bad page cannot inject `undefined` into the
// flattened dataset (the old `.catch(errs)` returned undefined).
const gatherBlog = async (page) => {
  try {
    const res = await axios.get(page);
    return res.status === 200 ? gatherResults(res.data) : [];
  } catch (err) {
    errs(err); // log and continue: scraping is best-effort per page
    return [];
  }
};
// Persist the flattened rows to the configured CSV file via
// indian-ocean; any write error is reported through `errs`.
const write = (flatJson) => {
  return io.writeData(fileName, flatJson, {}, errs);
};
// Entry point: fetch every archive page in parallel, flatten the
// per-page result arrays into one list, and write a single CSV.
(async function scrape() {
  // scrape all pages concurrently; errLog handles a wholesale failure
  const datasets = await Promise.all(urls.map((url) => gatherBlog(url))).catch(errLog);
  // Previously a rejected Promise.all left `datasets` undefined and the
  // script still wrote an empty CSV over any prior output — bail instead.
  if (!Array.isArray(datasets)) return;
  // flatten [[row, ...], [row, ...], ...] into a single array of rows
  const data = [].concat.apply([], datasets);
  // write
  write(data);
})();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment