Created
September 15, 2019 20:43
-
-
Save riccardogiorato/85d340b62a196eb70c1da500789cf402 to your computer and use it in GitHub Desktop.
headless-chrome-crawler example bot
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Example bot: crawls a site with headless-chrome-crawler and logs the
// <title> of every page it successfully visits.
const HCCrawler = require('headless-chrome-crawler');

console.log('Starting the fetch');

// Set to true to fetch only the start page instead of following links.
const singlePage = false;
// A depth of 1 restricts the crawl to the queued URL itself.
const maxDepthCrawler = singlePage ? 1 : 6;

const startUrl = 'http://scrapoxy.io/';
// Restrict the crawl to the host of the start URL. Deriving it from startUrl
// keeps the two in sync: a hard-coded domain that does not match the start
// URL causes the very first request to be filtered out.
const allowedHost = new URL(startUrl).hostname;

(async () => {
  const crawler = await HCCrawler.launch({
    userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    // Uncomment to watch the browser while debugging:
    // headless: false,
    // slowMo: 10,
    // Runs inside the page; `$` is expected to be the jQuery instance the
    // crawler injects (see the library's `jQuery` option).
    evaluatePage: () => ({
      title: $('title').text(),
      // ADD here your other elements to parse from jQuery selectors
    }),
    // Receives whatever evaluatePage returned under `result.result`.
    onSuccess: (result) => {
      const { title } = result.result;
      console.log(`PRODUCT - ${title}.`);
      // ACCESS here the elements evaluated inside the page from the previous
      // section; the names have to be the same.
    },
    // Report failed requests instead of dropping them silently.
    onError: (error) => {
      console.error('Request failed:', error);
    },
  });

  try {
    await crawler.queue({
      url: startUrl,
      maxDepth: maxDepthCrawler,
      depthPriority: false,
      allowedDomains: [allowedHost],
    });
    await crawler.onIdle();
  } finally {
    // Always shut the browser down, even if queueing or crawling threw.
    await crawler.close();
  }
})().catch((error) => {
  console.error('Crawl failed:', error);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment