Created
March 23, 2020 07:22
-
-
Save pnhuyduy/c507e94e9309511264713dc91b8dfee5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Source: https://intoli.com/blog/scrape-infinite-scroll/
const fs = require('fs'); | |
const puppeteer = require('puppeteer'); | |
/**
 * Collects the `src` URL of every image matched by the post-grid selector.
 *
 * This function is handed to `page.evaluate`, so it must stay self-contained:
 * it may only rely on globals available in the page (such as `document`),
 * not on anything from the surrounding Node.js module scope.
 *
 * @returns {string[]} Image URLs in document order; images whose `src` is
 *   empty are skipped so they neither count toward the caller's target nor
 *   end up as blank lines in the output.
 */
function extractItems() {
  const images = document.querySelectorAll('main > div > div:nth-child(4) > article > div > div > div img');
  const items = Array.from(images, (image) => image.src).filter((src) => src !== '');
  // Logged where the function runs (the page's console under page.evaluate).
  console.log(items);
  return items;
}
/**
 * Scrolls an infinite-scroll page until enough items have been extracted.
 *
 * Each pass extracts the current items, scrolls to the bottom, waits for the
 * document to grow taller, then pauses for `scrollDelay` milliseconds.
 *
 * Scraping is best-effort: if any step fails (for example the page stops
 * growing and `waitForFunction` rejects), the failure is logged and the items
 * gathered so far are returned instead of throwing.
 *
 * @param {object} page - Puppeteer page exposing `evaluate` and `waitForFunction`.
 * @param {Function} extractItems - Function evaluated in the page; returns an array of items.
 * @param {number} itemTargetCount - Stop once at least this many items were extracted.
 * @param {number} [scrollDelay=1000] - Milliseconds to pause after each scroll.
 * @returns {Promise<Array>} The most recently extracted items (may exceed or
 *   fall short of `itemTargetCount`).
 */
async function scrapeInfiniteScrollItems(
  page,
  extractItems,
  itemTargetCount,
  scrollDelay = 1000,
) {
  let items = [];
  try {
    while (items.length < itemTargetCount) {
      items = await page.evaluate(extractItems);
      const previousHeight = await page.evaluate('document.body.scrollHeight');
      await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
      await page.waitForFunction(`document.body.scrollHeight > ${previousHeight}`);
      // Plain timer instead of `page.waitFor`, which is deprecated and absent
      // from recent Puppeteer releases; calling it there threw a TypeError
      // that the old empty catch silently turned into an early exit.
      await new Promise((resolve) => setTimeout(resolve, scrollDelay));
    }
  } catch (err) {
    // Keep the partial result, but say why scrolling stopped.
    const reason = err && err.message ? err.message : err;
    console.warn(`Stopped scrolling after ${items.length} item(s): ${reason}`);
  }
  return items;
}
// Entry point: launch a browser, scrape image URLs from the profile page,
// and write them to ./items.txt, one per line.
(async () => {
  // Set up browser and page.
  const browser = await puppeteer.launch({
    headless: false,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
  try {
    const page = await browser.newPage();
    // setViewport returns a promise; await it so the size applies before navigation.
    await page.setViewport({ width: 1280, height: 926 });
    // Navigate to the demo page.
    await page.goto('https://www.instagram.com/diq.ng/');
    // Scroll and extract items from the page.
    const items = await scrapeInfiniteScrollItems(page, extractItems, 100);
    // Save extracted items to a file.
    fs.writeFileSync('./items.txt', items.join('\n') + '\n');
  } finally {
    // Close the browser even when navigation, scraping, or the write fails.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure and signal it through the exit code instead of
  // leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment