Last active
August 10, 2025 14:46
-
-
Save chriscarrollsmith/1c0b9a2ee15d545d2d0a16b6e3733b91 to your computer and use it in GitHub Desktop.
Scrape the text of a tweet by URL with Puppeteer. Self-installs dependencies when run with bun. Add arbitrary metadata with --metadata-json flag.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bun | |
// @bun-dependencies: puppeteer@latest puppeteer-extra@latest puppeteer-extra-plugin-stealth@latest | |
import puppeteer from 'puppeteer'; | |
/**
 * Open a tweet URL in headless Chrome and print its text as one line of JSON.
 *
 * On success, writes `{"id": <url>, "content": <text>[, "metadata": <metadata>]}`
 * to stdout. `content` is the `textContent` of the first
 * `[data-testid="tweetText"]` element, or the literal string
 * `'Tweet not found'` when no such element exists on the page.
 *
 * On failure (navigation timeout, missing `article` element, etc.) the error
 * message is written to stderr and `process.exitCode` is set to 1 so that
 * callers and shell pipelines can detect the failure. The browser is always
 * closed, whether or not scraping succeeded.
 *
 * NOTE(review): the file header declares puppeteer-extra and the stealth
 * plugin as dependencies, but only plain `puppeteer` is imported and used.
 *
 * @param {string} url - Tweet URL to load; also echoed back as `id`.
 * @param {object} [metadata] - Optional object copied verbatim into the
 *   output under `metadata`. Non-object or falsy values are omitted.
 * @returns {Promise<void>} Resolves once the browser has been closed.
 */
async function scrapeXTweet(url, metadata) {
  const browser = await puppeteer.launch({
    headless: true, // or 'new' for new headless mode
    executablePath: '/usr/bin/google-chrome',
    args: [
      '--disable-blink-features=AutomationControlled',
      '--disable-dev-shm-usage',
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-web-security',
      '--disable-features=VizDisplayCompositor',
    ],
  });

  // Everything after launch lives inside try/finally so that a failure at any
  // step (including page creation) cannot leave a Chrome process running.
  try {
    const page = await browser.newPage();

    // Set a realistic viewport and user agent
    await page.setViewport({ width: 1366, height: 768 });
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');

    // Navigate to the provided URL
    await page.goto(url, {
      waitUntil: 'networkidle2',
      timeout: 30000,
    });

    // Wait for content to load
    await page.waitForSelector('article', { timeout: 10000 });

    // Extract the tweet text (this callback runs in the page, not in this process)
    const tweetText = await page.evaluate(() => {
      const tweetElement = document.querySelector('[data-testid="tweetText"]');
      return tweetElement ? tweetElement.textContent : 'Tweet not found';
    });

    const result = { id: url, content: tweetText };
    if (metadata && typeof metadata === 'object') {
      result.metadata = metadata;
    }
    console.log(JSON.stringify(result));
  } catch (error) {
    console.error('Error:', error.message);
    // Signal failure without cutting short the cleanup in `finally`.
    process.exitCode = 1;
  } finally {
    await browser.close();
  }
}
// Parse CLI arguments.
//
// Accepted forms:
//   <tweet-url>                 first argument that does not start with '-'
//   --metadata-json <json>      JSON text parsed and passed through as metadata
//
// Any other argument is ignored. A missing URL, a missing value after
// --metadata-json, or unparsable JSON prints an error and exits with status 1.
let url = undefined;
let metadata = undefined;
const argv = process.argv.slice(2);
for (let i = 0; i < argv.length; i += 1) {
  const arg = argv[i];
  if (!arg.startsWith('-') && url === undefined) {
    url = arg;
    continue;
  }
  if (arg === '--metadata-json') {
    const jsonArg = argv[i + 1];
    if (jsonArg === undefined) {
      console.error('Error: --metadata-json requires a JSON argument');
      process.exit(1);
    }
    try {
      metadata = JSON.parse(jsonArg);
    } catch (e) {
      console.error('Error: Failed to parse JSON provided to --metadata-json');
      process.exit(1);
    }
    i += 1; // Skip value
  }
}
if (!url) {
  console.error('Usage: ./scrape.js <tweet-url> [--metadata-json "{\\"key\\":\\"value\\"}"]');
  console.error('Example: ./scrape.js https://x.com/TimHannan/status/1891309104934633635 --metadata-json "{\\"reply_text\\":\\"Python is great!\\"}"');
  process.exit(1);
}

// Do not leave the promise floating: a rejection (e.g. the browser failing to
// launch or to close) is reported on stderr and turned into a failing exit code.
scrapeXTweet(url, metadata).catch((error) => {
  console.error('Error:', error?.message ?? error);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment