Skip to content

Instantly share code, notes, and snippets.

@koladilip
Last active June 11, 2023 23:58
Show Gist options
  • Save koladilip/699a5f1d2ad17f60a294c1223af88465 to your computer and use it in GitHub Desktop.
Save koladilip/699a5f1d2ad17f60a294c1223af88465 to your computer and use it in GitHub Desktop.
Scrape a webpage with infinite scrolling using PhantomJS
const phantom = require('phantom');
/**
 * Pauses the calling async flow for the given duration.
 *
 * @param {number} timeInMills - Delay in milliseconds, handed to setTimeout.
 * @returns {Promise<void>} Promise that fulfils with `undefined` once the timer fires.
 */
async function wait(timeInMills) {
  await new Promise((done) => setTimeout(() => done(), timeInMills));
}
/**
 * Repeatedly scrolls the page to the bottom until a scroll no longer changes
 * the length of the page content (i.e. no more content was loaded), or until
 * `maxScrolls` scroll attempts have been made.
 *
 * Implemented as a loop rather than recursion so that a very long feed does
 * not build up an ever-growing chain of pending promises.
 *
 * @param {object} page - Page object; this function uses its `property` and
 *   `evaluate` methods.
 * @param {number} [maxScrolls=Infinity] - Upper bound on scroll attempts.
 *   The default keeps the original "scroll until nothing changes" behaviour.
 * @returns {Promise<void>} Resolves once scrolling has stopped.
 */
async function scrollPage(page, maxScrolls = Infinity) {
  let previousLength = (await page.property('content')).length;

  for (let attempt = 0; attempt < maxScrolls; attempt += 1) {
    // This callback is passed to page.evaluate, so it must stay a plain,
    // self-contained function that does not close over any outer variables.
    await page.evaluate(function () {
      window.document.body.scrollTop = document.body.scrollHeight;
    });

    // Wait between 5 and 10 seconds before re-measuring the content.
    await wait(Math.max(5000, 10000 * Math.random()));

    const nextLength = (await page.property('content')).length;
    if (nextLength === previousLength) {
      // Content size did not change after scrolling: nothing more to fetch.
      return;
    }

    console.log("Scrolling page:", await page.property('url'), "for more content");
    previousLength = nextLength;
  }
}
/**
 * Opens `pageUrl` in a fresh PhantomJS instance, optionally scrolls it until
 * no more content is loaded, and returns the page content.
 *
 * The PhantomJS instance is always shut down, even when creating the page,
 * opening it, scrolling or reading the content fails; otherwise the
 * instance would never be exited on error.
 *
 * @param {string} pageUrl - URL of the page to open.
 * @param {boolean} shouldScrollPage - When truthy, scroll the page via
 *   `scrollPage` before reading its content.
 * @returns {Promise<string>} Value of the page's `content` property.
 */
async function getPageData(pageUrl, shouldScrollPage) {
  const instance = await phantom.create();
  try {
    const page = await instance.createPage();
    await page.open(pageUrl);
    if (shouldScrollPage) {
      await scrollPage(page);
    }
    const pageContent = await page.property('content');
    await page.close();
    return pageContent;
  } finally {
    // Runs on both success and failure so the instance is always exited.
    await instance.exit();
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment