Last active
October 1, 2023 01:57
-
-
Save m-esm/79d59b2f6a84f30a5ada23ad4abaf336 to your computer and use it in GitHub Desktop.
Extract tweets from browser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Store of extracted tweets on the global object. An existing store is kept,
// so running this script again on the same page adds to earlier results.
if (!window.tweets) {
  window.tweets = {};
}
/**
 * Extracts the fields of a single tweet from its DOM element.
 *
 * Side effect: when the tweet contains a "Quote" label, the element that
 * follows the label's parent is parsed recursively as the quoted tweet and is
 * then removed from the DOM, so the selectors below no longer match content
 * inside the quoted tweet.
 *
 * @param {HTMLElement} tweetElem - Root element of one tweet.
 * @returns {{
 *   username: (string|undefined),
 *   text: (string|undefined),
 *   avatar: (string|null|undefined),
 *   time: (string|null|undefined),
 *   link: string,
 *   isRetweet: boolean,
 *   retweetedBy: (string|undefined),
 *   isQuote: boolean,
 *   quotedTweet: (Object|undefined),
 *   images: string[]
 * }} The parsed tweet; `link` is "" when no link was found, and
 *   `quotedTweet` has this same shape when present.
 */
window.parseTweetElement = (tweetElem) => {
  // A span whose whole text is "Quote" marks a tweet that quotes another one.
  const quoteLabel = Array.from(tweetElem.querySelectorAll("span")).find(
    (span) => span.textContent === "Quote"
  );
  const isQuote = Boolean(quoteLabel);

  let quotedTweet;
  if (quoteLabel) {
    // Declared locally: as an undeclared (global) variable it was overwritten
    // by the recursive call below whenever the quoted tweet was itself a quote.
    // nextElementSibling (not nextSibling) so a text node is never passed on.
    const quotedTweetElem = quoteLabel.parentElement?.nextElementSibling;
    if (quotedTweetElem) {
      quotedTweet = window.parseTweetElement(quotedTweetElem);
      quotedTweetElem.remove();
    }
  }

  // Queried after the quoted tweet was removed, so its spans are not considered.
  const username = Array.from(tweetElem.querySelectorAll("span"))
    .map((span) => span.textContent)
    .find((content) => content?.startsWith("@"));

  const avatar = tweetElem
    .querySelector('[data-testid="Tweet-User-Avatar"] img')
    ?.getAttribute("src");

  const text = tweetElem.querySelector('[data-testid="tweetText"]')?.textContent;

  const timeElem = tweetElem.querySelector("time");
  const time = timeElem?.getAttribute("datetime");
  // The <time> element's parent is expected to carry the tweet's relative URL.
  const link = timeElem?.parentElement?.getAttribute("href");

  // Treated as a retweet when a socialContext element with non-empty text exists.
  const isRetweet = Boolean(
    tweetElem.querySelector('[data-testid="socialContext"]')?.textContent
  );
  const retweetedBy = tweetElem.querySelector(
    '[data-testid="socialContext"] span span'
  )?.textContent;

  const images = Array.from(
    tweetElem.querySelectorAll('[data-testid="tweetPhoto"] img')
  )
    .map((img) => img.getAttribute("src"))
    .filter(Boolean);

  return {
    username,
    text,
    avatar,
    time,
    link: link ? `https://twitter.com${link}` : "",
    isRetweet,
    retweetedBy,
    isQuote,
    quotedTweet,
    images,
  };
};
/**
 * Parses every element matching `[data-testid="tweet"]` currently in the
 * document and stores each newly seen tweet in `window.tweets`, keyed by its
 * link. Tweets already stored under the same link are left untouched.
 * Logs the running total to the console.
 *
 * @returns {void}
 */
window.fetchTweets = () => {
  for (const tweetElem of document.querySelectorAll('[data-testid="tweet"]')) {
    const tweet = window.parseTweetElement(tweetElem);

    // The link is the dedupe key, so a tweet without one cannot be stored
    // meaningfully (all of them would collide on the "" key). The previous
    // guard, `Object.values(tweet).find((p) => !p)`, never fired because
    // `find` returns the falsy value it found.
    if (!tweet.link) continue;

    if (!window.tweets[tweet.link]) {
      window.tweets[tweet.link] = tweet;
    }
  }

  console.log(`Total tweets extracted: ${Object.keys(window.tweets).length}`);
};
/**
 * Repeatedly collects the tweets on the page and scrolls down, then downloads
 * everything collected in `window.tweets` as a pretty-printed JSON file named
 * `tweets_<timestamp>.json`.
 *
 * @param {number} scrollHeight - Pixels to scroll down after each collection pass.
 * @param {number} iterations - Number of collect-and-scroll passes.
 * @param {number} [delayMs=3000] - Milliseconds to wait after each scroll
 *   (presumably to let the page load more tweets).
 * @returns {Promise<void>} Resolves once the download has been triggered.
 */
window.scrollAndExtract = async (scrollHeight, iterations, delayMs = 3000) => {
  for (let i = 0; i < iterations; i++) {
    window.fetchTweets();
    window.scrollBy(0, scrollHeight);
    await new Promise((resolve) => setTimeout(resolve, delayMs));
  }
  // Final pass: without it, tweets that appeared after the last scroll would
  // never be collected.
  window.fetchTweets();

  const jsonString = JSON.stringify(Object.values(window.tweets), null, 2);
  const blob = new Blob([jsonString], { type: "application/json" });
  const url = URL.createObjectURL(blob);

  // Trigger the download through a temporary anchor element.
  const anchor = document.createElement("a");
  anchor.href = url;
  anchor.target = "_blank";
  anchor.download = `tweets_${Date.now()}.json`;
  document.body.appendChild(anchor);
  try {
    anchor.click();
  } finally {
    document.body.removeChild(anchor);
    // Revoke on a later tick: revoking synchronously right after click() can
    // cancel the download in some browsers before it has started.
    setTimeout(() => URL.revokeObjectURL(url), 1000);
  }
};
// Entry point: 3 collect-and-scroll passes, scrolling 10000px per pass.
// Uses top-level await, so run it where that is supported (e.g. the browser
// DevTools console).
await scrollAndExtract(10000, 3);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example extracted tweet: