Skip to content

Instantly share code, notes, and snippets.

@ntkog
Last active January 3, 2020 17:33
Show Gist options
  • Save ntkog/0912d63054ab70334b5a0530514f95b9 to your computer and use it in GitHub Desktop.
Save ntkog/0912d63054ab70334b5a0530514f95b9 to your computer and use it in GitHub Desktop.
Scraping Tweets from a user
const puppeteer = require('puppeteer');
const {writeFile} = require('jsonfile');
// Twitter handle to search for; taken from the first CLI argument,
// defaulting to "congosto" when none is given.
const TWITTER_USER = process.argv[2] || "congosto";
// Number of tweets to collect; taken from the second CLI argument.
// Parsed to an integer so later numeric comparisons do not rely on string
// coercion; a missing, zero or non-numeric value falls back to 100.
const TWEETS_TO_FETCH = Number.parseInt(process.argv[3], 10) || 100;
/**
 * Collects every element matching `.tweet` in the current document.
 *
 * This function is handed to `page.evaluate`, so it must stay
 * self-contained: it may only reference names available in the page itself
 * (such as `document`), never helpers from the surrounding script.
 *
 * @returns {Array<{metadata: Object, text: string}>} One entry per `.tweet`
 *   element: `metadata` is a plain copy of the element's `data-*`
 *   attributes, `text` is the trimmed text of its
 *   `.js-tweet-text-container` child, or `''` when that child is absent.
 */
function extractItems() {
  return [...document.querySelectorAll('.tweet')].map(el => {
    const textContainer = el.querySelector('.js-tweet-text-container');
    return {
      metadata: {...el.dataset},
      // A `.tweet` node without a text container used to throw here and
      // abort the whole extraction; report an empty text instead.
      text: textContainer ? textContainer.textContent.trim() : '',
    };
  });
}
/**
 * Repeatedly extracts items from an infinitely scrolling page, scrolling to
 * the bottom between extractions, until at least `itemTargetCount` items
 * have been collected or scrolling can no longer make progress.
 *
 * Each pass re-extracts the full list from the page, so the returned array
 * is the result of the most recent successful extraction and may contain
 * more than `itemTargetCount` items.
 *
 * @param {object} page - Puppeteer page (uses `evaluate`, `waitForFunction`
 *   and `waitFor`).
 * @param {Function} extractItems - Function evaluated inside the page; must
 *   return an array of objects carrying a `text` property.
 * @param {number} itemTargetCount - Minimum number of items wanted.
 * @param {number} [scrollDelay] - Milliseconds to pause after each scroll;
 *   defaults to a random value in [1500, 5000) chosen once per call.
 * @returns {Promise<Array>} Items collected so far. Errors raised while
 *   scraping (for example a `waitForFunction` rejection when the page stops
 *   growing) are logged and the items gathered up to that point are
 *   returned.
 */
async function scrapeInfiniteScrollItems(
  page,
  extractItems,
  itemTargetCount,
  scrollDelay = Math.random() * (5000 - 1500) + 1500,
) {
  let items = [];
  try {
    while (items.length < itemTargetCount) {
      items = await page.evaluate(extractItems);
      // Progress output: show the most recent tweet without removing it
      // from the result (the previous `pop()` dropped it and threw on an
      // empty page).
      if (items.length > 0) {
        console.log(items[items.length - 1].text);
      }
      // Enough items: stop before triggering another scroll and wait.
      if (items.length >= itemTargetCount) {
        break;
      }
      const previousHeight = await page.evaluate('document.body.scrollHeight');
      await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
      // Resolves once the page has grown, i.e. more content was loaded.
      await page.waitForFunction(`document.body.scrollHeight > ${previousHeight}`);
      await page.waitFor(scrollDelay);
    }
  } catch (e) {
    // Best effort: keep what was collected, but do not hide the reason the
    // loop stopped early.
    console.warn(`Scraping stopped early: ${e && e.message ? e.message : e}`);
  }
  return items;
}
// Entry point: launches a headless browser, opens the Twitter search for
// mentions of TWITTER_USER (including native retweets), scrapes tweets by
// scrolling, and writes them to ./items.json.
(async () => {
  // Set up browser and page.
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
  try {
    const page = await browser.newPage();
    // setViewport is asynchronous; wait for it before navigating.
    await page.setViewport({ width: 1280, height: 926 });
    // Navigate to the search results page. The handle is encoded so that
    // special characters cannot alter the query string.
    const encodedUser = encodeURIComponent(TWITTER_USER);
    await page.goto(`https://twitter.com/search?q=%40${encodedUser}%20include%3Anativeretweets&src=typed_query`);
    // Scroll and extract items from the page.
    const items = await scrapeInfiniteScrollItems(page, extractItems, TWEETS_TO_FETCH);
    // Save extracted items to a file; awaited so write failures surface
    // instead of becoming an unhandled rejection.
    await writeFile('./items.json', items);
  } finally {
    // Close the browser even when scraping or saving fails.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
@Angelmmiguel
Copy link

Muy útil! Muchas gracias 😄

@ntkog
Copy link
Author

ntkog commented Jan 2, 2020

De nada! :-)

@ntkog
Copy link
Author

ntkog commented Jan 3, 2020

I've just put it in a repo :-)
twitter_scrap
Have fun!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment