Created
January 23, 2024 16:43
-
-
Save 1337hero/284dd76d6f1e5fd7df0b3d6fc2cf5183 to your computer and use it in GitHub Desktop.
FB Email Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Serialises scraped rows to CSV text and triggers a browser download.
 *
 * Column order is fixed by the `headers` list below; keys on a row that are
 * not listed are ignored, and listed keys missing from a row become empty
 * cells.
 *
 * @param {Array<Object>} data - Rows to export, keyed by the header names.
 * @param {string} [fileName] - Download file name; falls back to 'data.csv'
 *   when empty or omitted.
 * @returns {void}
 */
function createCSV(data, fileName) {
  const headers = [
    'id',
    'author_name',
    'post',
    'is_post',
    'comment',
    'is_comment',
    'first_name',
    'last_name',
    'email',
  ]
  // null is written as the literal text "null"; strings are always wrapped in
  // double quotes (with embedded quotes doubled) so commas and newlines inside
  // a cell cannot break the row; everything else is left to Array#join.
  const formatCell = (value) => {
    if (value === null) return 'null'
    if (typeof value === 'string') {
      return `"${value.replace(/"/g, '""')}"`
    }
    return value
  }
  const csvContent = [
    headers.join(','),
    ...data.map((row) =>
      headers.map((header) => formatCell(row[header])).join(','),
    ),
  ].join('\n')
  const blob = new Blob([csvContent], { type: 'text/csv;charset=utf-8;' })
  // Resolve the fallback once so both download paths use the same name.
  const downloadName = fileName || 'data.csv'
  if (typeof navigator !== 'undefined' && navigator.msSaveBlob) {
    // IE 10+
    navigator.msSaveBlob(blob, downloadName)
    return
  }
  // Other browsers: click a temporary <a download> pointing at an object URL.
  const link = document.createElement('a')
  const url = URL.createObjectURL(blob)
  link.setAttribute('href', url)
  link.setAttribute('download', downloadName)
  document.body.appendChild(link)
  link.click()
  document.body.removeChild(link)
  URL.revokeObjectURL(url)
}
/**
 * Scrolls the window down once and then waits for the page to settle.
 *
 * Called with no arguments it waits 400 ms, scrolls by 2000 px, then waits
 * another 1000 ms.
 *
 * @param {Object} [options]
 * @param {number} [options.distance=2000] - Pixels to scroll vertically.
 * @param {number} [options.delayMs=400] - Wait before scrolling.
 * @param {number} [options.settleMs=1000] - Wait after scrolling.
 * @returns {Promise<void>}
 */
async function scrollDown({
  distance = 2000,
  delayMs = 400,
  settleMs = 1000,
} = {}) {
  // Local timer helper so this function does not depend on anything else in the file.
  const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms))
  await wait(delayMs)
  window.scrollBy(0, distance)
  await wait(settleMs)
}
/**
 * Returns the first email-looking substring found in `text`.
 *
 * @param {string|null|undefined} text - Text to search.
 * @returns {string} The first match, or '' when there is none or `text` is nullish.
 */
function getEmailFromText(text) {
  // Only the first match is needed, so the pattern carries no `g` flag.
  const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/
  return text?.match(emailRegex)?.[0] || ''
}
/**
 * Clicks the element that opens a post's comments.
 *
 * For every <div> inside `post` whose data-visualcompletion attribute equals
 * "ignore-dynamic", follows a fixed chain of child indexes and clicks the
 * element found at the end of it, if there is one.
 *
 * @param {Element} post - Post container to search.
 * @returns {void}
 */
function clickOnComments(post) {
  // Snapshot the live collection so DOM changes caused by a click cannot
  // shift the elements still waiting to be visited.
  const allDivs = Array.from(post.getElementsByTagName('div'))
  for (const div of allDivs) {
    if (div.getAttribute('data-visualcompletion') !== 'ignore-dynamic') {
      continue
    }
    // Positional path to the clickable element; any missing link in the chain
    // yields undefined and the div is skipped.
    const thingToClickToOpenComments =
      div?.children?.[0]?.children?.[0]?.children?.[0]?.children?.[0]
        ?.children?.[0]?.children?.[1]?.children?.[1]?.children?.[0]
        ?.children?.[0]
    if (thingToClickToOpenComments) {
      thingToClickToOpenComments.click()
    }
  }
}
/**
 * Walks `element` and its descendants depth-first and collects the trimmed
 * text of every leaf text node that is not whitespace-only.
 *
 * @param {Node} element - Root node to walk.
 * @returns {string[]} Text fragments in document order.
 */
function traverseElements(element) {
  // One shared accumulator avoids re-copying the result array at every level.
  const textArray = []
  const visit = (node) => {
    if (node.childNodes.length > 0) {
      for (let i = 0; i < node.childNodes.length; i++) {
        visit(node.childNodes[i])
      }
      return
    }
    // Leaf node: only text nodes contribute.
    if (node.nodeType === Node.TEXT_NODE) {
      const text = node.nodeValue.trim()
      if (text !== '') {
        textArray.push(text)
      }
    }
  }
  visit(element)
  return textArray
}
function getTextFromComment(textArray) { | |
return textArray | |
?.filter((section) => { | |
if (section === 'Reply') { | |
return false | |
} | |
if (section?.match(/^\d+$/)) { | |
return false | |
} | |
if (section === 'Like') { | |
return false | |
} | |
if (section === 'Top Contributor') { | |
return false | |
} | |
if (section === 'Follow') { | |
return false | |
} | |
if (section === '·') { | |
return false | |
} | |
return true | |
}) | |
?.slice(1, textArray.length - 3) | |
?.join(' ') | |
} | |
/**
 * Extracts the comments found under a post, or under the open dialog when no
 * post is given.
 *
 * A comment is any <div> whose aria-label starts with "Comment by".
 *
 * @param {Element} [post] - Container to search; defaults to the first
 *   `div[role=dialog]` in the document.
 * @returns {Array<{id: string, author_name: string, comment: string, email: string}>}
 *   One record per comment; an empty array when there is no container.
 */
function extractComments(post = undefined) {
  const parent = post || document.querySelector('div[role=dialog]')
  if (!parent) {
    return []
  }
  const commentDivs = parent.querySelectorAll('div[aria-label^="Comment by"]')
  return Array.from(commentDivs, (commentDiv) => {
    const textArray = traverseElements(commentDiv)
    const text = getTextFromComment(textArray)
    return {
      // All fragments joined together serve as the id.
      id: textArray.join('-').toLowerCase(),
      // The first fragment is used as the author name.
      author_name: textArray[0],
      comment: text,
      email: getEmailFromText(text),
    }
  })
}
/**
 * Collects the feed entries that look like posts.
 *
 * Takes the direct <div> children of `div[role=feed]` and keeps those that
 * contain an <h3> with non-empty text (the poster name).
 *
 * @returns {Element[]} Matching post containers in document order.
 */
function getAllPosts() {
  const feedItems = document.querySelectorAll('div[role=feed] > div')
  return [...feedItems].filter((post) =>
    Boolean(post?.querySelector('h3')?.textContent),
  )
}
/**
 * Resolves after roughly `ms` milliseconds.
 *
 * @param {number} ms - Delay passed to setTimeout.
 * @returns {Promise<void>}
 */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms))
/**
 * Clicks the first `div[aria-label="Close"]` in the document, if one exists.
 *
 * @returns {void}
 */
function closeDialog() {
  document.querySelector('div[aria-label="Close"]')?.click()
}
/**
 * Returns the text of a post's body as one space-joined string.
 *
 * @param {Element|null|undefined} post - Post container.
 * @returns {string|undefined} Body text, or undefined when the body element
 *   cannot be found.
 */
function getPostText(post) {
  // Purely positional selector. The simpler 'div[data-ad-preview="message"]'
  // was tried and did not match every post.
  const postText = post?.querySelector(
    'div > div > div > div > div > div > div > div > div > div:nth-child(8) > div > div > div:nth-child(3) > div',
  )
  if (!postText) {
    return
  }
  return traverseElements(postText)?.join(' ')
}
/**
 * Clicks every `div[role="button"]` inside `post` whose trimmed text is
 * exactly "See more".
 *
 * @param {Element} post - Post container to search.
 * @returns {void}
 */
function clickSeeMoreIfItsThere(post) {
  for (const button of post.querySelectorAll('div[role="button"]')) {
    if (button.textContent.trim() === 'See more') {
      button.click()
    }
  }
}
/**
 * Tags each comment record with its parent post's text and an `is_comment`
 * flag.
 *
 * @param {{post?: string}|null|undefined} post - Parent post record; only
 *   its `post` field is read.
 * @param {Array<Object>} comments - Comment records.
 * @returns {Array<Object>} New records; the input comments are not mutated.
 */
function getAllCommentsAndFormat(post, comments) {
  // The comment is spread last, so its own fields win over the two added here.
  return comments.map((comment) => ({
    post: post?.post,
    is_comment: true,
    ...comment,
  }))
}
/**
 * Builds a post id from the poster name and the post text: spaces become
 * hyphens, the two parts are joined with a hyphen, and the result is
 * lower-cased.
 *
 * @param {string|null|undefined} posterName - Poster's display name.
 * @param {string|null|undefined} postText - Post body text.
 * @returns {string} The id; a missing part contributes an empty string.
 */
function getPostId(posterName, postText) {
  // Treat a missing part as '' so the literal text "undefined" never ends up in an id.
  const hyphenate = (value) => (value ?? '').split(' ').join('-')
  return `${hyphenate(posterName)}-${hyphenate(postText)}`.toLowerCase()
}
/**
 * Scrapes the posts currently in the feed, together with their comments, and
 * downloads the result as 'facebookGroupPostAndComments.csv'.
 *
 * For each post it expands "See more", reads the post text, collects the
 * comments visible inline, opens the comments dialog, collects the comments
 * shown there and closes the dialog. While the file-level `scrolls` counter
 * is above zero it then scrolls down, decrements the counter and re-reads the
 * post list. Records are de-duplicated by `id` before export.
 *
 * @returns {Promise<void>}
 */
async function run() {
  console.log('starting...')
  const allContent = []
  let posts = getAllPosts()
  console.log('posts.length', posts.length)
  let i = 0
  // `posts` is replaced after each scroll, so the bound is re-read every pass.
  while (i < posts.length) {
    const post = posts[i]
    console.log(
      `while you're waiting, why not check out https://thewebscrapingguy.com/? 😅`,
    )
    const posterName = post?.querySelector('h3')?.textContent
    console.log('posterName', posterName)
    clickSeeMoreIfItsThere(post)
    // Fixed pause after each click before the DOM is read again.
    await sleep(1000)
    const postText = getPostText(post)
    const commentsDisplayedWithoutClicking = extractComments(post)
    clickOnComments(post)
    await sleep(1000)
    // With no argument the comments are read from the open dialog.
    const commentsAfterClickingModal = extractComments()
    closeDialog()
    // Only the first two space-separated words of the name are used.
    const [firstName, lastName] = posterName?.split(' ') ?? []
    const content = {
      id: getPostId(posterName, postText),
      is_post: true,
      author_name: posterName,
      first_name: firstName,
      last_name: lastName,
      post: postText,
      email: getEmailFromText(postText),
    }
    const comments = getAllCommentsAndFormat(content, [
      ...commentsDisplayedWithoutClicking,
      ...commentsAfterClickingModal,
    ])
    allContent.push(content)
    allContent.push(...comments)
    i++
    if (scrolls > 0) {
      await scrollDown()
      scrolls--
      const currentPosts = getAllPosts()
      console.log('currentPosts', currentPosts.length)
      posts = currentPosts
    }
  }
  // Keep the first record seen for each id; a Set makes each lookup O(1).
  const seenIds = new Set()
  const unique = allContent.filter((content) => {
    if (seenIds.has(content.id)) {
      return false
    }
    seenIds.add(content.id)
    return true
  })
  console.log('done!')
  console.log('allContent', unique)
  createCSV(unique, 'facebookGroupPostAndComments.csv')
  console.log(
    `Congrats! 🎉 You scraped a sh*t ton of posts! If you need any custom scrapers built, email me: [email protected]`,
  )
}
// NOTE: to scrape more posts, increase "scrolls" below. run() scrolls the page
// once after each processed post, decrementing this counter, until it reaches 0.
let scrolls = 5
// Top-level await: this needs a context that allows it, such as the browser
// devtools console or an ES module.
await run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
How to use it