Coolblue crawler
const Apify = require('apify');
const fs = require('fs');

Apify.main(async () => {
    // Create a requestQueue
    const requestQueue = await Apify.openRequestQueue();

    // Add the first requests to the queue
    await requestQueue.addRequest({ url: 'https://www.coolblue.nl/sitemap/nl_en/products_1.xml' });
    await requestQueue.addRequest({ url: 'https://www.coolblue.nl/sitemap/nl_en/products_2.xml' });

    const crawler = new Apify.PuppeteerCrawler({
        requestQueue,
        handlePageFunction: async ({ page, request }) => {
            // Wait 10 seconds per request
            await Apify.utils.sleep(10000);
            console.log(`Processing ${request.url}...`);

            /*
                If we are dealing with a URL that ends with ".xml", we treat it
                as a sitemap request and add all of its URLs to the requestQueue.
            */
            if (request.url.endsWith('.xml')) {
                const urls = await page.evaluate(() => Array.from(
                    document.querySelectorAll('sitemap loc, urlset url loc'),
                    element => element.textContent));
                for (const url of urls) {
                    await requestQueue.addRequest({ url });
                }
            }
            /*
                If the URL does not end with ".xml", we are dealing with a product page,
                so we collect its name, price and URL and append them to "products.csv".
            */
            else {
                const nameElement = await page.$('span.js-product-name');
                const name = await page.evaluate(nameElement => nameElement.textContent, nameElement);
                const priceElement = await page.$('strong.sales-price__current');
                const price = await page.evaluate(priceElement => priceElement.textContent, priceElement);
                fs.appendFile('products.csv', `${name}, ${price}, ${request.url}\n`, (err) => {
                    if (err) throw err;
                });
            }
        },
        maxConcurrency: 2, // Allow a maximum of 2 concurrent requests
    });

    await crawler.run(); // Start the crawler
    console.log('Done.'); // Notify us when the crawling is done
});
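
Each scraped product ends up as a "name, price, url" line in products.csv, so a quick way to sanity-check a run is a small Node script like the sketch below (the file name and column order are taken from the crawler above; nothing else is assumed):

const fs = require('fs');

// Read the CSV produced by the crawler and keep only non-empty lines.
const rows = fs.readFileSync('products.csv', 'utf8')
    .split('\n')
    .filter(line => line.trim().length > 0);

console.log(`Collected ${rows.length} products`);

// Print the first few rows in "name, price, url" order.
rows.slice(0, 5).forEach(row => console.log(row));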