Skip to content

Instantly share code, notes, and snippets.

@saulofilho
Created February 9, 2021 10:58
Show Gist options
  • Save saulofilho/6ad3e7bee66b40005f77a340fb916105 to your computer and use it in GitHub Desktop.
Save saulofilho/6ad3e7bee66b40005f77a340fb916105 to your computer and use it in GitHub Desktop.
Scraper with Puppeteer, Node.js and MongoDB
const puppeteer = require('puppeteer');
const Product = require('./../app/models/Product');
/** Landing page whose product links are collected and visited. */
const START_URL = 'https://world.openfoodfacts.org/';

/** CSS path of the product-details column shared by several field selectors. */
const DETAILS_COLUMN =
  '#main_column > div > div > div.medium-12.large-8.xlarge-8.xxlarge-8.columns';

/**
 * Runs inside the browser page (passed to `$$eval`), so it must stay
 * self-contained: no references to variables from this module.
 *
 * @param {Element[]} links - Anchor elements matched by the selector.
 * @param {string} keyword - Substring the anchor's href must contain.
 * @returns {string} Non-empty innerHTML values of matching anchors, joined by ', '.
 */
const joinLinkLabels = (links, keyword) =>
  links
    .filter(anchor => anchor.href.includes(keyword))
    .map(anchor => anchor.innerHTML)
    .filter(Boolean)
    .join(', ');

/**
 * Opens the landing page and returns the href of every `.list_product_a` anchor.
 * The page is closed even when navigation or evaluation fails.
 *
 * @param {object} browser - Browser instance returned by `puppeteer.launch`.
 * @returns {Promise<string[]>} Product page URLs.
 */
const collectProductLinks = async browser => {
  const indexPage = await browser.newPage();
  try {
    await indexPage.goto(START_URL);
    return await indexPage.$$eval('.list_product_a', anchors =>
      anchors.map(anchor => anchor.href)
    );
  } finally {
    await indexPage.close();
  }
};

/**
 * Opens one product page and extracts the fields stored on a Product.
 * The page is always closed, including when a required selector is missing
 * (in which case the returned promise rejects).
 *
 * @param {object} browser - Browser instance returned by `puppeteer.launch`.
 * @param {string} link - URL of the product page.
 * @returns {Promise<object>} Plain object with the scraped product fields.
 */
const scrapeProduct = async (browser, link) => {
  const productPage = await browser.newPage();
  try {
    await productPage.goto(link);
    const detailLinks = `${DETAILS_COLUMN} > p > a`;
    return {
      code: await productPage.$eval('#barcode', el => el.innerText),
      barcode: await productPage.$eval('#barcode_paragraph', el =>
        el.innerText.replace('Barcode: ', '')
      ),
      status: 'imported',
      imported_t: new Date(),
      url: link,
      product_name: await productPage.$eval(
        'h1[property="food:name"]',
        el => el.innerText
      ),
      // The label lives in a <span>; the value is the rest of the parent <p>.
      quantity: await productPage.$$eval(`${DETAILS_COLUMN} > p > span`, spans =>
        spans
          .filter(span => span.innerHTML.includes('Quantity:'))
          .map(span => span.parentElement.innerText.replace('Quantity: ', ''))
          .filter(Boolean)
          .join(', ')
      ),
      categories: await productPage.$$eval(detailLinks, joinLinkLabels, 'category'),
      packaging: await productPage.$$eval(detailLinks, joinLinkLabels, 'packaging'),
      brands: await productPage.$$eval(
        `${DETAILS_COLUMN} > p > a[itemprop="brand"]`,
        anchors =>
          anchors
            .map(anchor => anchor.innerHTML)
            .filter(Boolean)
            .join(', ')
      ),
      image_url: await productPage.$eval('#og_image', el => el.src),
    };
  } finally {
    await productPage.close();
  }
};

/**
 * Persists one scraped product and logs the outcome. A failed save is logged
 * and does not reject, so one bad document cannot stop the run.
 *
 * @param {object} data - Fields produced by `scrapeProduct`.
 * @returns {Promise<void>}
 */
const saveProduct = async data => {
  const doc = new Product(data);
  console.log('Data', doc);
  try {
    // Awaited so saves finish before the run is considered done.
    await doc.save();
    console.log('ok');
  } catch (err) {
    console.log(err);
  }
};

/**
 * Scrapes every product linked from the Open Food Facts landing page and
 * saves each one as a Product document.
 *
 * Products are processed one at a time. A product whose page cannot be
 * scraped is logged and skipped. The browser is closed when the run ends,
 * whether it succeeded or failed.
 *
 * @returns {Promise<void>} Settles when the run is finished. Failures are
 *   logged rather than propagated, so the promise does not reject.
 */
const scrapper = async () => {
  try {
    const browser = await puppeteer.launch({
      headless: true,
      args: ['--disable-setuid-sandbox'],
      ignoreHTTPSErrors: true,
    });
    try {
      const productLinks = await collectProductLinks(browser);
      for (const link of productLinks) {
        try {
          const productData = await scrapeProduct(browser, link);
          await saveProduct(productData);
        } catch (err) {
          console.error(`Failed to scrape ${link}`, err);
        }
      }
    } finally {
      await browser.close();
    }
  } catch (err) {
    console.error('Scrapper run failed', err);
  }
};
// NOTE: scrapper is invoked here, so the scrape starts as soon as this module
// is first loaded; the export is scrapper's return value, not the function.
module.exports = scrapper();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment