Created
February 9, 2021 10:58
-
-
Save saulofilho/6ad3e7bee66b40005f77a340fb916105 to your computer and use it in GitHub Desktop.
Scraper with Puppeteer, Node.js and MongoDB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const puppeteer = require('puppeteer'); | |
const Product = require('./../app/models/Product'); | |
// Listing page whose `.list_product_a` anchors are followed to the product pages.
const LISTING_URL = 'https://world.openfoodfacts.org/';

// Column whose paragraphs hold the Quantity, category, packaging and brand
// entries read by the helpers below.
const DETAILS_COLUMN_SELECTOR =
  '#main_column > div > div > div.medium-12.large-8.xlarge-8.xxlarge-8.columns';

/**
 * Joins the text following every "Quantity:" label in the details column.
 *
 * @param {object} page - Puppeteer page already navigated to a product.
 * @returns {Promise<string>} Comma-separated quantities ('' when none found).
 */
const readQuantity = page =>
  page.$$eval(`${DETAILS_COLUMN_SELECTOR} > p > span`, spans =>
    spans
      .filter(span => span.innerHTML.includes('Quantity:'))
      .map(span => span.parentElement.innerText.replace('Quantity: ', ''))
      .filter(Boolean)
      .join(', ')
  );

/**
 * Joins the inner HTML of the details-column links whose href contains
 * `hrefFragment` (used for the 'category' and 'packaging' links).
 *
 * @param {object} page - Puppeteer page already navigated to a product.
 * @param {string} hrefFragment - Substring the link's href must contain.
 * @returns {Promise<string>} Comma-separated values ('' when none found).
 */
const readLinkedValues = (page, hrefFragment) =>
  page.$$eval(
    `${DETAILS_COLUMN_SELECTOR} > p > a`,
    // The callback runs inside the browser, so the fragment has to be passed
    // in as an argument instead of being captured from this scope.
    (anchors, fragment) =>
      anchors
        .filter(anchor => anchor.href.includes(fragment))
        .map(anchor => anchor.innerHTML)
        .filter(Boolean)
        .join(', '),
    hrefFragment
  );

/**
 * Joins the inner HTML of every `itemprop="brand"` link in the details column.
 *
 * @param {object} page - Puppeteer page already navigated to a product.
 * @returns {Promise<string>} Comma-separated brands ('' when none found).
 */
const readBrands = page =>
  page.$$eval(`${DETAILS_COLUMN_SELECTOR} > p > a[itemprop="brand"]`, anchors =>
    anchors
      .map(anchor => anchor.innerHTML)
      .filter(Boolean)
      .join(', ')
  );

/**
 * Opens `link` in a new tab, reads the product fields and always closes the tab.
 *
 * @param {object} browser - Launched Puppeteer browser.
 * @param {string} link - URL of the product page.
 * @returns {Promise<object>} Plain object that is handed to `new Product(...)`.
 * @throws When navigation fails or a `$eval` selector is missing on the page.
 */
const scrapeProduct = async (browser, link) => {
  const productPage = await browser.newPage();
  try {
    await productPage.goto(link);

    const code = await productPage.$eval('#barcode', el => el.innerText);
    const barcode = await productPage.$eval('#barcode_paragraph', el =>
      el.innerText.replace('Barcode: ', '')
    );
    const importedAt = new Date();
    const productName = await productPage.$eval(
      'h1[property="food:name"]',
      el => el.innerText
    );
    const quantity = await readQuantity(productPage);
    const categories = await readLinkedValues(productPage, 'category');
    const packaging = await readLinkedValues(productPage, 'packaging');
    const brands = await readBrands(productPage);
    const imageUrl = await productPage.$eval('#og_image', el => el.src);

    return {
      code,
      barcode,
      status: 'imported',
      imported_t: importedAt,
      url: link,
      product_name: productName,
      quantity,
      categories,
      packaging,
      brands,
      image_url: imageUrl,
    };
  } finally {
    // Close the tab on failure too, so a broken product page cannot leak it.
    await productPage.close();
  }
};

/**
 * Scrapes every product linked from the listing page and saves each one as a
 * `Product` document.
 *
 * A failure on one product (navigation, missing selector, or save) is logged
 * and the run moves on to the next link. The browser is closed when the run
 * ends, whether it succeeded or not.
 *
 * @returns {Promise<void>} Settles once all links have been processed and the
 *   browser has been closed; scrape and save failures are logged, not thrown.
 */
const scrapper = () =>
  (async () => {
    let browser;
    try {
      browser = await puppeteer.launch({
        headless: true,
        args: ['--disable-setuid-sandbox'],
        ignoreHTTPSErrors: true,
      });

      const listingPage = await browser.newPage();
      let productLinks;
      try {
        await listingPage.goto(LISTING_URL);
        productLinks = await listingPage.$$eval('.list_product_a', anchors =>
          anchors.map(anchor => anchor.href)
        );
      } finally {
        await listingPage.close();
      }

      // Products are visited one at a time, so only one product tab is open.
      for (const link of productLinks) {
        try {
          const productData = await scrapeProduct(browser, link);
          const product = new Product(productData);
          console.log('Data', product);
          await product.save();
          console.log('ok');
        } catch (err) {
          // Keep going: one bad product must not abort the whole import.
          console.log(err);
        }
      }
    } catch (err) {
      console.log(err);
    } finally {
      if (browser) {
        await browser.close();
      }
    }
  })();
// Runs the scraper once when this module is first required and exports
// whatever `scrapper()` returns, so requiring the file is what starts a scrape.
module.exports = scrapper();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment