Last active
January 26, 2017 22:56
-
-
Save simonswiss/2fe2ff41cd4a946a0e8a11f971b79732 to your computer and use it in GitHub Desktop.
Hipster Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require('fs') | |
const axios = require('axios') | |
const cheerio = require('cheerio') | |
const URLS = require('./urls') | |
const TARGET_FILE = './data.js' | |
/**
 * Fetch one page, extract every `.review` element into an entry object,
 * append the entries to the shared `data` array, then advance the
 * `runLoop` generator so the next URL gets scraped.
 *
 * Once the generator reports it is done, the accumulated `data` is written
 * to TARGET_FILE as pretty-printed JSON.
 *
 * A failed request (or a failure while parsing the page) is logged and the
 * page is skipped. Without that, a single rejection would stop the chain:
 * the generator would never be advanced again and nothing would be written.
 *
 * @param {string} url - address handed to `axios.get`
 * @returns {Promise<void>} settles after this page was handled and the
 *   generator was advanced (the file write, if any, has only been started)
 */
function scrapeData(url) {
  return axios
    .get(url)
    .then(response => {
      // cheerio setup
      const $ = cheerio.load(response.data)

      /* #################################
         ##### Custom Business Logic #####
         ################################# */

      // each page may have multiple entries, we'll push them into this array
      const entries = []

      $('.review').each((index, element) => {
        const review = $(element)
        const ratingText = review.find('.rating-md span').text().trim()

        // grab the values of the review and map them to Craft field names
        entries.push({
          title: review.find('.review-author h6 a').text().trim(),
          reviewTitle: review.find('.review-content h3').text(),
          reviewText: review.find('.review-overall').text().trim(),
          reviewAvatar: review.find('.review-author img').attr('src'),
          reviewDate: review.find('.rating-md p meta[itemprop="datePublished"]').attr('content'),
          // only the first character of the rating text is parsed
          reviewRating: Number.parseInt(ratingText.substring(0, 1), 10),
          reviewBestRating: 5
        })
      })

      /* ################################## */

      // push all the page entries into our data array
      data.push(...entries)

      // log update to terminal
      console.info(`Scraped ${entries.length} entries from ${url}`)
    })
    .catch(err => {
      // skip this page but keep going, so the remaining URLs are still scraped
      const reason = err instanceof Error ? err.message : String(err)
      console.error(`Failed to scrape ${url}: ${reason}`)
    })
    .then(() => {
      // call next iteration of the generator (runs whether the page succeeded or not)
      const nextItem = runLoop.next(data)

      // nothing more to do until the generator has finished iterating
      if (nextItem.done !== true) return

      // Write JSON data to file..
      fs.writeFile(TARGET_FILE, JSON.stringify(data, null, 2), err => {
        if (err) {
          console.error(err)
          return
        }
        console.info(
          ['##########', `${data.length} entries written to "${TARGET_FILE}"!`, '##########'].join('\n')
        )
      })
    })
}
// The Generator...
// Walks URLS one step at a time: every resumption calls scrapeData() for the
// next url and pauses again, yielding whatever scrapeData() returned.
function* loop() {
  const pending = URLS[Symbol.iterator]()
  let step = pending.next()
  while (!step.done) {
    yield scrapeData(step.value)
    step = pending.next()
  }
}
// Kicking off the loop with an empty data array!
// `data` is declared explicitly: assigning it inline (`data = []`) created an
// implicit global, which throws a ReferenceError in strict mode / ES modules.
const data = []
const runLoop = loop()
runLoop.next(data)
At line 50 I check if the done
value of the generator is true
, which only happens when the iterator has gone through its last yield
. At that point, I can trigger the fs.writeFile()
.
runLoop.next()
only calls the first yield
, but then you can see that the scrapeData()
function calls .next()
for each URL, on line 47, so effectively the generator will go through all indexes of the array! 👌
This is great! Love it. Thanks Simon
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
But you're not getting the option to output something once everything is done like with
Promise.all
right? Also calling
runLoop.next()
once will only iterate through the first URL, no?