-
-
Save AliAryanTech/d89b3943efc03451dd11bb96e47146d6 to your computer and use it in GitHub Desktop.
Scrape Facebook Group
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Accumulates every scraped row (posts and comments) until it is exported as CSV.
const allContent = []
/**
 * Serialises `data` to CSV text and triggers a browser download of it.
 *
 * Only the fixed columns listed in `headers` are exported. A property that is
 * missing from a row yields an empty cell, `null` is written as the literal
 * text `null`, strings are wrapped in double quotes with embedded quotes
 * doubled, and any other value is written with its default string conversion.
 *
 * @param {Object[]} [data] - Rows to export, keyed by column name.
 * @param {string} [fileName] - Download name; falls back to `data.csv`.
 * @returns {void}
 */
function createCSV(data = [], fileName) {
  const headers = [
    'id',
    'email',
    'firstName',
    'lastName',
    'postId',
    'postText',
    'postAuthor',
    'postAuthorId',
    'postAuthorUrl',
    'commentId',
    'commentText',
    'commentAuthorName',
    'commentAuthorId',
    'commentAuthorUrl',
    'timestamp',
    'commentUrl',
  ]

  // Formats one cell; `undefined` is returned untouched because
  // Array.prototype.join renders it as an empty string.
  const formatCell = (value) => {
    if (value === null) return 'null'
    if (typeof value === 'string') {
      // Wrap all string fields, including those without commas, in double quotes
      return `"${value.replace(/"/g, '""')}"`
    }
    return value
  }

  const rows = data.map((row) =>
    headers.map((header) => formatCell(row[header])).join(','),
  )
  const csvContent = [headers.join(','), ...rows].join('\n')
  const blob = new Blob([csvContent], { type: 'text/csv;charset=utf-8;' })
  // Apply the fallback name on both download paths.
  const downloadName = fileName || 'data.csv'

  if (navigator.msSaveBlob) {
    // IE 10+
    navigator.msSaveBlob(blob, downloadName)
    return
  }

  // Everywhere else: click a temporary link that points at an object URL.
  const url = URL.createObjectURL(blob)
  const link = document.createElement('a')
  link.setAttribute('href', url)
  link.setAttribute('download', downloadName)
  document.body.appendChild(link)
  link.click()
  document.body.removeChild(link)
  URL.revokeObjectURL(url)
}
/**
 * Scrolls the window down by one fixed step, then pauses.
 *
 * The scroll is issued after a 400 ms delay and is followed by a 1000 ms
 * wait before the returned promise resolves (presumably so lazily loaded
 * posts have time to render; confirm against the page's behaviour).
 *
 * @returns {Promise<void>}
 */
async function scrollDown() {
  const distance = 800
  // A single delayed scroll; the original interval cleared itself on its first tick.
  await new Promise((resolve) => setTimeout(resolve, 400))
  window.scrollBy(0, distance)
  await new Promise((resolve) => setTimeout(resolve, 1000))
}
/**
 * Returns the first email-looking substring found in `text`.
 *
 * @param {string|null|undefined} text - Text to search; non-strings are ignored.
 * @returns {string} The first match, or an empty string when there is none.
 */
function getEmailFromText(text) {
  // Guard against non-string values, which have no usable `match` method.
  if (typeof text !== 'string') return ''
  const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g
  const [firstEmail = ''] = text.match(emailRegex) ?? []
  return firstEmail
}
/**
 * Clicks the element nested at a fixed position inside every descendant
 * `div` of `post` whose `data-visualcompletion` attribute equals
 * `ignore-dynamic`.
 *
 * The caller uses this to open a post's comments. The child-index path is
 * tied to the page's current markup, so a div where the path does not
 * resolve is skipped silently.
 *
 * @param {Element} post - Root element of a single post.
 * @returns {void}
 */
function clickOnComments(post) {
  // children[...] indexes to follow from the matching div down to the click target.
  const clickTargetPath = [0, 0, 0, 0, 0, 1, 1, 0, 0]

  // Snapshot the live collection so clicks cannot disturb the iteration.
  const divs = [...post.getElementsByTagName('div')]
  for (const div of divs) {
    if (div.getAttribute('data-visualcompletion') !== 'ignore-dynamic') {
      continue
    }
    const thingToClickToOpenComments = clickTargetPath.reduce(
      (node, childIndex) => node?.children?.[childIndex],
      div,
    )
    if (thingToClickToOpenComments) {
      thingToClickToOpenComments.click()
    }
  }
}
/**
 * Recursively collects the text found beneath `element`.
 *
 * Nodes with children are expanded depth-first; a childless node contributes
 * its trimmed `nodeValue` only when it is a text node with non-whitespace
 * content.
 *
 * @param {Node} element - Root node to walk.
 * @returns {string[]} Trimmed text fragments in document order.
 */
function traverseElementsToGetText(element) {
  if (element.childNodes.length > 0) {
    // flatMap avoids re-copying the accumulator for every child.
    return [...element.childNodes].flatMap((child) =>
      traverseElementsToGetText(child),
    )
  }
  if (element.nodeType !== Node.TEXT_NODE) {
    return []
  }
  const text = element.nodeValue.trim()
  return text === '' ? [] : [text]
}
/**
 * Returns the direct children of the `div[role=feed]` container that look
 * like posts, i.e. those containing an `h2` with non-empty text content.
 *
 * @returns {Element[]} Matching elements in document order.
 */
function getAllPosts() {
  const feedChildren = document.querySelectorAll('div[role=feed] > div')
  return [...feedChildren].filter((post) =>
    Boolean(post?.querySelector('h2')?.textContent),
  )
}
/**
 * Resolves after roughly `ms` milliseconds.
 *
 * @param {number} ms - Delay passed to `setTimeout`.
 * @returns {Promise<void>}
 */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms))
/**
 * Clicks the first `div` labelled "Close", if one is present in the document.
 * Does nothing when no such element exists.
 *
 * @returns {void}
 */
function closeDialog() {
  document.querySelector('div[aria-label="Close"]')?.click()
}
/**
 * Converts the preview comments embedded in a post payload into flat rows.
 *
 * Every field is read defensively, so an entry without `comment`, `body` or
 * `author` produces a row with empty/undefined values instead of throwing.
 * `firstName` and `lastName` are the first and second space-separated tokens
 * of the author name.
 *
 * @param {string} postId - Id of the post the comments belong to.
 * @param {Array|null} [topLevelComments] - Entries shaped like
 *   `{ comment: { id, body: { text }, author: { name, id } } }`.
 * @returns {Object[]} One row per entry.
 */
function formatTopLevelComments(postId, topLevelComments = []) {
  // The default parameter only covers `undefined`; the payload may carry `null`.
  return (topLevelComments ?? []).map((entry) => {
    const comment = entry?.comment
    const text = comment?.body?.text
    const commentId = comment?.id
    const authorName = comment?.author?.name
    const authorId = comment?.author?.id
    const nameParts = authorName?.split(' ')
    return {
      id: commentId,
      commentId,
      postId,
      commentText: text || '',
      commentAuthorName: authorName,
      commentAuthorId: authorId,
      email: getEmailFromText(text),
      firstName: nameParts?.[0],
      lastName: nameParts?.[1],
    }
  })
}
/**
 * Extracts a post row and its preview comments from a payload whose story
 * sits under `data.node.group_feed.edges[0].node`.
 *
 * Missing pieces of the payload yield `undefined` fields (or `''` for the
 * text and email) rather than an exception.
 *
 * @param {Object} json - Parsed JSON payload.
 * @returns {{post: Object, topLevelComments: Object[]}}
 */
function parseFirstLevelJson(json) {
  // Shared prefix of every lookup below.
  const sections =
    json?.data?.node?.group_feed?.edges?.[0]?.node?.comet_sections
  const contentSections = sections?.content?.story?.comet_sections
  const feedbackStory = sections?.feedback?.story

  const actor =
    contentSections?.context_layout?.story?.comet_sections?.actor_photo?.story
      ?.actors?.[0]
  const postText = contentSections?.message_container?.story?.message?.text
  const postId = feedbackStory?.post_id
  const nameParts = actor?.name?.split(' ')

  const post = {
    id: postId,
    postId,
    postText: postText || '',
    postAuthor: actor?.name,
    postAuthorId: actor?.id,
    postAuthorUrl: actor?.url,
    email: getEmailFromText(postText),
    firstName: nameParts?.[0],
    lastName: nameParts?.[1],
  }
  const topLevelComments = formatTopLevelComments(
    postId,
    feedbackStory?.feedback_context?.interesting_top_level_comments,
  )
  return {
    post,
    topLevelComments,
  }
}
/**
 * Extracts a post row and its preview comments from a payload whose story
 * sits directly under `data.node`.
 *
 * Missing pieces of the payload yield `undefined` fields (or `''` for the
 * text and email) rather than an exception.
 *
 * @param {Object} json - Parsed JSON payload.
 * @returns {{post: Object, topLevelComments: Object[]}}
 */
function parseSecondLevelJson(json) {
  // Shared prefix of every lookup below.
  const sections = json?.data?.node?.comet_sections
  const contentSections = sections?.content?.story?.comet_sections
  const feedbackStory = sections?.feedback?.story

  const actor =
    contentSections?.context_layout?.story?.comet_sections?.actor_photo?.story
      ?.actors?.[0]
  const posterName = actor?.name
  const postText = contentSections?.message_container?.story?.message?.text
  const postId = feedbackStory?.post_id
  const nameParts = posterName?.split(' ')

  const post = {
    id: postId,
    postId,
    postText: postText || '',
    postAuthor: posterName,
    postAuthorId: actor?.id,
    postAuthorUrl: actor?.url,
    email: getEmailFromText(postText),
    firstName: nameParts?.[0],
    lastName: nameParts?.[1],
  }
  const topLevelComments = formatTopLevelComments(
    postId,
    feedbackStory?.feedback_context?.interesting_top_level_comments,
  )
  return {
    post,
    topLevelComments,
  }
}
/**
 * Extracts a post row and its preview comments from a payload whose story
 * sits directly under `data.node`.
 *
 * Missing pieces of the payload yield `undefined` fields (or `''` for the
 * text and email) rather than an exception.
 *
 * @param {Object} json - Parsed JSON payload.
 * @returns {{post: Object, topLevelComments: Object[]}}
 */
function parseThirdLevelJson(json) {
  // Shared prefix of every lookup below.
  const sections = json?.data?.node?.comet_sections
  const contentSections = sections?.content?.story?.comet_sections
  const feedbackStory = sections?.feedback?.story

  const actor =
    contentSections?.context_layout?.story?.comet_sections?.actor_photo?.story
      ?.actors?.[0]
  const posterName = actor?.name
  const postText = contentSections?.message_container?.story?.message?.text
  const postId = feedbackStory?.post_id
  const nameParts = posterName?.split(' ')

  const post = {
    id: postId,
    postId,
    postText: postText || '',
    postAuthor: posterName,
    postAuthorId: actor?.id,
    postAuthorUrl: actor?.url,
    email: getEmailFromText(postText),
    firstName: nameParts?.[0],
    lastName: nameParts?.[1],
  }
  const topLevelComments = formatTopLevelComments(
    postId,
    feedbackStory?.feedback_context?.interesting_top_level_comments,
  )
  return {
    post,
    topLevelComments,
  }
}
/**
 * Appends to `allContent` every comment whose `commentId` is not already
 * carried by a stored row (including rows added earlier in the same call).
 *
 * @param {Object[]|null} [comments] - Comment rows to merge.
 * @returns {void}
 */
function addCommentsToAllContent(comments = []) {
  // One pass to index existing ids instead of rescanning the array per comment.
  const knownIds = new Set(allContent.map((row) => row.commentId))
  for (const comment of comments ?? []) {
    if (knownIds.has(comment.commentId)) {
      continue
    }
    knownIds.add(comment.commentId)
    allContent.push(comment)
  }
}
/**
 * Patches `XMLHttpRequest.prototype.open` so that responses to GraphQL
 * requests are inspected and their posts/comments appended to `allContent`.
 *
 * Requests whose URL does not contain `graphql` pass straight through. For
 * the others, the request body is captured by wrapping `send` on that
 * instance, and on `load` the response is handled according to which query
 * name the body mentions:
 *  - `GroupsCometFeedRegularStoriesPaginationQuery`: the first three lines
 *    of the newline-delimited response are parsed as posts;
 *  - `CometFocusedStoryViewUFIQuery`: the response is parsed as a comment list.
 * Lines or responses that are not valid JSON are skipped with a warning.
 *
 * @returns {void}
 */
function interceptRequests() {
  const postsQueryName = 'GroupsCometFeedRegularStoriesPaginationQuery'
  const commentsQueryName = 'CometFocusedStoryViewUFIQuery'

  // Parses JSON text, returning null (and warning) instead of throwing.
  const parseJsonOrNull = (text, label) => {
    if (typeof text !== 'string' || text.trim() === '') return null
    try {
      return JSON.parse(text)
    } catch (error) {
      console.warn(`could not parse ${label}`, error)
      return null
    }
  }

  // Handles a feed pagination response: one JSON document per line.
  const collectPosts = (payload) => {
    const lines = String(payload ?? '').split('\n')
    const lineParsers = [
      ['firstPost', parseFirstLevelJson],
      ['secondPost', parseSecondLevelJson],
      ['thirdPost', parseThirdLevelJson],
    ]
    lineParsers.forEach(([label, parse], index) => {
      const json = parseJsonOrNull(lines[index], `posts response line ${index + 1}`)
      if (json === null) return
      const parsed = parse(json)
      console.log(label, parsed)
      allContent.push(parsed.post)
      addCommentsToAllContent(parsed.topLevelComments)
    })
  }

  // Handles a focused-story response carrying a post's comment list.
  const collectComments = (payload) => {
    const data = parseJsonOrNull(payload, 'comments response')
    const postId = data?.data?.story_card?.post_id
    const edges =
      data?.data?.feedback?.ufi_renderer?.feedback?.comment_list_renderer
        ?.feedback?.comment_rendering_instance_for_feed_location?.comments
        ?.edges
    const comments = edges?.map((edge) => {
      const comment = edge?.node
      const commentId = comment?.id
      const commentText = comment?.body?.text
      const authorName = comment?.author?.name
      const timeLink = comment?.comment_action_links?.find(
        (link) => link?.__typename === 'XFBCommentTimeStampActionLink',
      )?.comment
      const nameParts = authorName?.split(' ')
      return {
        id: commentId,
        commentId,
        postId,
        commentText,
        commentAuthorName: authorName,
        commentAuthorId: comment?.author?.id,
        commentAuthorUrl: comment?.author?.url,
        timestamp: timeLink?.created_time,
        commentUrl: timeLink?.url,
        email: getEmailFromText(commentText),
        firstName: nameParts?.[0],
        lastName: nameParts?.[1],
      }
    })
    addCommentsToAllContent(comments)
    console.log('comments', comments)
  }

  const originalOpen = window.XMLHttpRequest.prototype.open
  window.XMLHttpRequest.prototype.open = function (...openArgs) {
    // `url` may be a URL object rather than a string, so stringify before matching.
    const url = openArgs[1]
    if (!String(url).includes('graphql')) {
      return originalOpen.apply(this, openArgs)
    }

    // Wrap `send` on this instance to remember the request body.
    let requestBody = null
    const originalSend = this.send
    this.send = function (...sendArgs) {
      requestBody = sendArgs[0]
      return originalSend.apply(this, sendArgs)
    }

    this.addEventListener('load', () => {
      // Bodies may be non-strings (FormData, Blob, ...); match on their string form.
      const body =
        typeof requestBody === 'string'
          ? requestBody
          : String(requestBody ?? '')
      const isPostsQuery = body.includes(postsQueryName)
      const isCommentsQuery = !isPostsQuery && body.includes(commentsQueryName)
      if (!isPostsQuery && !isCommentsQuery) return
      // Reading responseText throws unless responseType is '' or 'text'.
      if (this.responseType && this.responseType !== 'text') return

      if (isPostsQuery) {
        console.log('getting posts')
        collectPosts(this.responseText)
      } else {
        console.log('getting comments')
        collectComments(this.responseText)
      }
    })

    // Forward exactly the arguments the caller supplied to the original open.
    return originalOpen.apply(this, openArgs)
  }
}
/**
 * Drives the scrape: installs the request interceptor, opens the comments of
 * each post currently in the feed, scrolls for more posts while the
 * module-level `scrolls` budget lasts, then exports `allContent` as CSV.
 *
 * The CSV export happens in a `finally` block, so rows collected before an
 * unexpected error are still downloaded; the error itself is rethrown.
 *
 * @returns {Promise<void>}
 */
async function run() {
  interceptRequests()
  console.log('starting...')
  let posts = getAllPosts()
  console.log('posts.length', posts.length)
  try {
    // `posts` may be replaced inside the loop, so the bound is re-read every pass.
    for (let i = 0; i < posts.length; i++) {
      console.log(
        `while you're waiting, why not check out https://thewebscrapingguy.com/? 😅`,
      )
      clickOnComments(posts[i])
      await sleep(1000)
      closeDialog()
      if (scrolls > 0) {
        await scrollDown()
        scrolls--
        console.log('scrolls left', scrolls)
        console.log('old posts', posts.length)
        const currentPosts = getAllPosts()
        console.log('currentPosts', currentPosts.length)
        posts = currentPosts
      }
    }
  } finally {
    // Export whatever was collected, even if the loop failed part-way.
    createCSV(allContent, 'facebookGroupPostsAndComments.csv')
  }
  console.log('allContent', allContent)
  console.log('done!')
  console.log(
    `Congrats! 🎉 You scraped a sh*t ton of posts! If you need any custom scrapers built, email me: [email protected]`,
  )
}
// Number of scroll steps `run` may still perform; it is decremented after each scroll.
let scrolls = 50
// NOTE: Only gets the first level comments
await run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment