Created
August 31, 2018 14:34
-
-
Save chrisheseltine/cbcf80ef15d9e54107ba31c61ab8ca09 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const Apify = require('apify');
Apify.main(async () => {
// Builds the PetHarbor search-results URL for one animal type. The seed URLs
// are identical except for the `where=type_<TYPE>` query parameter, so the
// long literal is kept in exactly one place.
const buildRosterUrl = (animalType) => `https://petharbor.com/results.asp?searchtype=ADOPT&start=1&miles=20&shelterlist=%27HAMP%27&zip=&where=type_${animalType}&nosuccess=1&nomax=1&rows=25&nobreedreq=1&nopod=1&nocustom=1&samaritans=1&view=sysadm.v_hamp&imgres=detail&stylesheet=https://cbbb1e2ef05c549bf4c2-7b792f487d9839572907a6863bac8ad2.ssl.cf5.rackcdn.com/petharbor.css&grid=1&NewOrderBy=Name&text=000000&link=007c0f&col_bg=ffffff`;

// Queue shared by the seeding code below, addDataLinksToQueue() and the crawler.
const requestQueue = await Apify.openRequestQueue();

// Seed the queue with one "roster" (listing) page per animal type, cats first.
for (const animalType of ['CAT', 'DOG']) {
    await requestQueue.addRequest(new Apify.Request({
        url: buildRosterUrl(animalType),
        userData: {
            label: 'roster',
        },
    }));
}
/**
 * Collects the links inside `.GridResultsContainer` on a roster page and
 * enqueues each one on the shared `requestQueue` with the label "data".
 *
 * @param {object} page - Puppeteer page handed to the crawler's handlePageFunction.
 * @returns {Promise<void>} Resolves once every link has been passed to the queue.
 */
async function addDataLinksToQueue(page) {
    // $$eval hands the callback the ARRAY of matched elements, so the hrefs
    // must be mapped out individually (reading `.href` on the array itself
    // yields undefined). Anchors with an empty href are dropped so that no
    // request is created without a URL.
    const links = await page.$$eval(
        '.GridResultsContainer a',
        (anchors) => anchors.map((anchor) => anchor.href).filter(Boolean),
    );
    console.log(`links: ${links}`);
    // enqueue the pages and give them label "data" so you can distinguish between roster and data
    for (const url of links) {
        await requestQueue.addRequest(new Apify.Request({
            url,
            userData: {
                label: 'data',
            },
        }));
        console.log('dataLinkQueued');
    }
}
/**
 * Extracts the record for a single "data" page.
 *
 * @param {object} page - Puppeteer page handed to the crawler's handlePageFunction.
 * @returns {Promise<{id: string}>} Object whose `id` is the innerText of the
 *     `font` element selected by `page.$eval` (presumably the animal's ID —
 *     TODO confirm against the page markup).
 */
async function extractData(page) {
    // $eval evaluates the callback against the element matched by the
    // selector; per the Puppeteer docs it rejects when nothing matches.
    const id = await page.$eval('font', (fontElement) => fontElement.innerText);
    return { id };
}
// Crawler that works through `requestQueue` and dispatches every loaded page
// on the label stored in the request's userData.
const crawler = new Apify.PuppeteerCrawler({
    requestQueue,
    handlePageFunction: async ({ page, request }) => {
        const { label } = request.userData;
        if (label === 'roster') {
            // add data links to the queue on roster pages
            await addDataLinksToQueue(page);
        } else if (label === 'data') {
            // when reaching data pages extract the data
            const data = await extractData(page);
            // `data` is an object: a template literal is needed for
            // interpolation, and JSON.stringify makes its contents visible.
            console.log(`got data: ${JSON.stringify(data)}`);
            await Apify.pushData(data);
        }
    },
});
// Await the crawl: without this the async function passed to Apify.main()
// resolves immediately, before any page has been processed.
await crawler.run();
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment