1337hero · January 23, 2024 16:43 · 1337hero · Jan 23, 2024
diff --git a/fb-email-scraper.js b/fb-email-scraper.js
 /**
  * Serialises rows to CSV text and triggers a browser download of the result.
  *
  * Columns are fixed by the header list below: row properties outside that
  * list are ignored, and a listed property that is missing from a row yields
  * an empty cell (Array.prototype.join renders undefined as '').
  *
  * @param {Array<Object>} data - Rows to export, one object per CSV line.
  * @param {string} [fileName] - Download name; any falsy value falls back to
  *   'data.csv' on both download paths.
  */
 function createCSV(data, fileName) {
  const headers = [
    'id',
    'author_name',
    'post',
    'is_post',
    'comment',
    'is_comment',
    'first_name',
    'last_name',
    'email',
  ]

  // Strings are always wrapped in double quotes (embedded quotes doubled);
  // null is written as the literal text null; anything else is left for
  // join() to stringify.
  const formatCell = (value) => {
    if (value === null) return 'null'
    if (typeof value === 'string') return `"${value.replace(/"/g, '""')}"`
    return value
  }

  const lines = data.map((row) =>
    headers.map((header) => formatCell(row[header])).join(','),
  )
  const csvContent = [headers.join(','), ...lines].join('\n')

  const blob = new Blob([csvContent], { type: 'text/csv;charset=utf-8;' })
  // Resolve the name once so the legacy path gets the same fallback.
  const downloadName = fileName || 'data.csv'

  if (navigator.msSaveBlob) {
    // IE 10+
    navigator.msSaveBlob(blob, downloadName)
    return
  }

  // Everywhere else: click a temporary <a download> pointing at an object URL.
  const url = URL.createObjectURL(blob)
  const link = document.createElement('a')
  link.setAttribute('href', url)
  link.setAttribute('download', downloadName)
  document.body.appendChild(link)

  link.click()

  document.body.removeChild(link)
  URL.revokeObjectURL(url)
 }

 /**
  * Scrolls the window down once and gives the page time to react.
  *
  * Sequence: wait 400 ms, scroll the window 2000 px vertically, wait 1000 ms.
  * The earlier version used a setInterval that was cleared on its first tick,
  * which is the same single delayed scroll expressed indirectly.
  *
  * @returns {Promise<void>} Resolves after the final pause.
  */
 async function scrollDown() {
  const scrollDistancePx = 2000
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms))

  await pause(400)
  window.scrollBy(0, scrollDistancePx)
  await pause(1000)
 }

 /**
  * Returns the first e-mail-looking substring of `text`.
  *
  * @param {string|null|undefined} text - Text to search.
  * @returns {string} The first match, or '' when `text` is nullish or holds
  *   no match.
  */
 function getEmailFromText(text) {
  const emailPattern = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g
  const matches = text?.match(emailPattern)
  const firstMatch = matches?.[0]
  return firstMatch ? firstMatch : ''
 }

 /**
  * Clicks the comment-opening element inside a post.
  *
  * For every div under `post` whose data-visualcompletion attribute equals
  * "ignore-dynamic", walks a fixed chain of child indexes and clicks the
  * element found at the end of it. Divs whose chain breaks off early are
  * skipped.
  *
  * @param {Element} post - Post container to search.
  */
 function clickOnComments(post) {
  // Child indexes followed from each marker div down to the click target.
  const pathToTrigger = [0, 0, 0, 0, 0, 1, 1, 0, 0]
  const stepInto = (node, childIndex) => node?.children?.[childIndex]

  for (const div of post.getElementsByTagName('div')) {
    if (div.getAttribute('data-visualcompletion') !== 'ignore-dynamic') {
      continue
    }
    const trigger = pathToTrigger.reduce(stepInto, div)
    if (trigger) {
      trigger.click()
    }
  }
 }

 /**
  * Collects the trimmed text of every non-blank text node under `element`,
  * in document (depth-first) order.
  *
  * @param {Node} element - Root of the subtree to read.
  * @returns {string[]} Trimmed text fragments; empty when there are none.
  */
 function traverseElements(element) {
  const children = element.childNodes

  // Leaf: only a text node with visible content contributes anything.
  if (children.length === 0) {
    if (element.nodeType !== Node.TEXT_NODE) {
      return []
    }
    const trimmed = element.nodeValue.trim()
    return trimmed === '' ? [] : [trimmed]
  }

  // Branch: gather the fragments of each child, left to right.
  const fragments = []
  for (const child of children) {
    for (const fragment of traverseElements(child)) {
      fragments.push(fragment)
    }
  }
  return fragments
 }

 function getTextFromComment(textArray) {
  return textArray
    ?.filter((section) => {
      if (section === 'Reply') {
        return false
      }
      if (section?.match(/^\d+$/)) {
        return false
      }
      if (section === 'Like') {
        return false
      }
      if (section === 'Top Contributor') {
        return false
      }
      if (section === 'Follow') {
        return false
      }
      if (section === '·') {
        return false
      }
      return true
    })
    ?.slice(1, textArray.length - 3)
    ?.join(' ')
 }

 /**
  * Extracts the comments found under a container.
  *
  * Searches `post` when given, otherwise the first `div[role=dialog]` in the
  * document. A comment is any div whose aria-label starts with "Comment by".
  *
  * @param {Element} [post] - Container to search; omit to use the open dialog.
  * @returns {Array<{id: string, author_name: string, comment: string, email: string}>}
  *   One record per comment div; [] when there is no container.
  */
 function extractComments(post = undefined) {
  const container = post ? post : document?.querySelector('div[role=dialog]')
  if (!container) {
    return []
  }

  // Text fragments of every comment div, in document order.
  const commentTextArrays = []
  for (const div of container.getElementsByTagName('div')) {
    const label = div.getAttribute('aria-label')
    if (label && label.startsWith('Comment by')) {
      commentTextArrays.push(traverseElements(div))
    }
  }

  return commentTextArrays.map((parts) => {
    const comment = getTextFromComment(parts)
    return {
      // All fragments joined, so identical comments share an id.
      id: parts?.join('-')?.toLowerCase(),
      // The first fragment is used as the author's name.
      author_name: parts?.[0],
      comment,
      email: getEmailFromText(comment),
    }
  })
 }

 /**
  * Lists the feed entries that look like posts.
  *
  * Takes the direct div children of `div[role=feed]` and keeps those that
  * contain an <h3> with non-empty text.
  *
  * @returns {Element[]} Matching feed entries, in document order.
  */
 function getAllPosts() {
  const feedEntries = Array.from(
    document.querySelectorAll('div[role=feed] > div'),
  )
  return feedEntries.filter((entry) =>
    Boolean(entry?.querySelector('h3')?.textContent),
  )
 }

 /**
  * Resolves after `ms` milliseconds.
  *
  * @param {number} ms - Delay passed to setTimeout.
  * @returns {Promise<void>}
  */
 const sleep = (ms) =>
  new Promise((done) => {
    setTimeout(done, ms)
  })

 /**
  * Clicks the first `div[aria-label="Close"]` in the document, if there is one.
  */
 function closeDialog() {
  const closeButton = document?.querySelector('div[aria-label="Close"]')
  if (closeButton) {
    closeButton.click()
  }
 }

 /**
  * Reads the body text of a post.
  *
  * The text container is located with a purely positional selector. A
  * `div[data-ad-preview="message"]` lookup was tried before and did not match
  * every post.
  *
  * @param {Element|null|undefined} post - Post container.
  * @returns {string|undefined} Text fragments joined by spaces, or undefined
  *   when the container is not found.
  */
 function getPostText(post) {
  const postTextSelector =
    'div > div > div > div > div > div > div > div > div > div:nth-child(8) > div > div > div:nth-child(3) > div'
  const textContainer = post?.querySelector(postTextSelector)

  if (!textContainer) {
    return undefined
  }

  const fragments = traverseElements(textContainer)
  return fragments?.join(' ')
 }

 /**
  * Expands truncated text inside a post by clicking every
  * `div[role="button"]` whose trimmed text is exactly "See more".
  *
  * @param {Element} post - Post container to search.
  */
 function clickSeeMoreIfItsThere(post) {
  for (const button of post.querySelectorAll('div[role="button"]')) {
    const label = button.textContent.trim()
    if (label === 'See more') {
      button.click()
    }
  }
 }

 /**
  * Tags comment records with the text of the post they belong to.
  *
  * Each result starts with `post` and `is_comment: true`, then takes every
  * property of the comment, so a comment's own `post` / `is_comment` wins.
  *
  * @param {{post?: string}|null|undefined} post - Record of the parent post.
  * @param {Array<Object>} comments - Comment records to tag.
  * @returns {Array<Object>} New records; the inputs are not modified.
  */
 function getAllCommentsAndFormat(post, comments) {
  const tagWithPost = (comment) => ({
    post: post?.post,
    is_comment: true,
    ...comment,
  })
  return comments.map(tagWithPost)
 }

 /**
  * Derives a post id from the author name and the post text.
  *
  * Spaces in both parts become hyphens, the parts are joined with a hyphen
  * and the result is lower-cased. A nullish part shows up as the text
  * "undefined".
  *
  * @param {string|null|undefined} posterName - Author name.
  * @param {string|null|undefined} postText - Post body.
  * @returns {string} The derived id.
  */
 function getPostId(posterName, postText) {
  const hyphenate = (value) => value?.split(' ')?.join('-')
  const combined = `${hyphenate(posterName)}-${hyphenate(postText)}`
  return combined.toLowerCase()
 }

 /**
  * Scrapes the posts currently in the feed, plus their comments, and
  * downloads everything as 'facebookGroupPostAndComments.csv'.
  *
  * For each post: expands "See more", reads the post text, collects the
  * comments visible inline, opens the comment dialog, collects the comments
  * shown there and closes the dialog. While the outer `scrolls` counter is
  * positive, the page is scrolled after each post (decrementing the counter)
  * and the post list is re-read. Records are de-duplicated by `id`, keeping
  * the first occurrence.
  *
  * @returns {Promise<void>} Resolves once the CSV download has been triggered.
  */
 async function run() {
  console.log('starting...')
  const allContent = []
  let posts = getAllPosts()
  console.log('posts.length', posts.length)
  let i = 0

  // Index loop rather than for...of: `posts` is replaced after each scroll.
  while (i < posts.length) {
    const post = posts[i]
    console.log(
      `while you're waiting, why not check out https://thewebscrapingguy.com/? 😅`,
    )
    const posterName = post?.querySelector('h3')?.textContent
    console.log('posterName', posterName)
    clickSeeMoreIfItsThere(post)
    // Give the expanded text time to render before reading it.
    await sleep(1000)
    const postText = getPostText(post)
    const commentsDisplayedWithoutClicking = extractComments(post)
    clickOnComments(post)
    // Give the comment dialog time to open before reading it.
    await sleep(1000)
    const commentsAfterClickingModal = extractComments()
    closeDialog()

    // Only the first two space-separated tokens of the name are used.
    const [firstName, lastName] = posterName?.split(' ') ?? []
    const content = {
      id: getPostId(posterName, postText),
      is_post: true,
      author_name: posterName,
      first_name: firstName,
      last_name: lastName,
      post: postText,
      email: getEmailFromText(postText),
    }

    const comments = getAllCommentsAndFormat(content, [
      ...commentsDisplayedWithoutClicking,
      ...commentsAfterClickingModal,
    ])

    allContent.push(content)
    allContent.push(...comments)
    i++
    if (scrolls > 0) {
      await scrollDown()
      scrolls--
      const currentPosts = getAllPosts()
      console.log('currentPosts', currentPosts.length)
      posts = currentPosts
    }
  }

  // Keep the first record per id. A Set gives constant-time membership checks
  // where scanning an array of ids made the pass quadratic.
  const seenIds = new Set()
  const unique = allContent.filter((content) => {
    if (seenIds.has(content.id)) {
      return false
    }
    seenIds.add(content.id)
    return true
  })

  console.log('done!')
  console.log('allContent', unique)
  createCSV(unique, 'facebookGroupPostAndComments.csv')
  console.log(
    `Congrats! 🎉 You scraped a sh*t ton of posts! If you need any custom scrapers built, email me: [email protected]`,
  )
 }

 // NOTE: to increase the number of posts, increase the "scrolls" variable below
 // run() scrolls the page once after each processed post while this counter is
 // positive and decrements it every time, so it caps the number of scrolls.
 let scrolls = 5
 // Top-level await: only valid where the host allows it (an ES module, or a
 // console that supports it).
 await run()
	/**
	 * Serialises rows to CSV text and triggers a browser download of the result.
	 *
	 * Columns are fixed by the header list below: row properties outside that
	 * list are ignored, and a listed property that is missing from a row yields
	 * an empty cell (Array.prototype.join renders undefined as '').
	 *
	 * @param {Array<Object>} data - Rows to export, one object per CSV line.
	 * @param {string} [fileName] - Download name; any falsy value falls back to
	 *   'data.csv' on both download paths.
	 */
	function createCSV(data, fileName) {
		const headers = [
			'id',
			'author_name',
			'post',
			'is_post',
			'comment',
			'is_comment',
			'first_name',
			'last_name',
			'email',
		]

		// Strings are always wrapped in double quotes (embedded quotes doubled);
		// null is written as the literal text null; anything else is left for
		// join() to stringify.
		const formatCell = (value) => {
			if (value === null) return 'null'
			if (typeof value === 'string') return `"${value.replace(/"/g, '""')}"`
			return value
		}

		const lines = data.map((row) =>
			headers.map((header) => formatCell(row[header])).join(','),
		)
		const csvContent = [headers.join(','), ...lines].join('\n')

		const blob = new Blob([csvContent], { type: 'text/csv;charset=utf-8;' })
		// Resolve the name once so the legacy path gets the same fallback.
		const downloadName = fileName || 'data.csv'

		if (navigator.msSaveBlob) {
			// IE 10+
			navigator.msSaveBlob(blob, downloadName)
			return
		}

		// Everywhere else: click a temporary <a download> pointing at an object URL.
		const url = URL.createObjectURL(blob)
		const link = document.createElement('a')
		link.setAttribute('href', url)
		link.setAttribute('download', downloadName)
		document.body.appendChild(link)

		link.click()

		document.body.removeChild(link)
		URL.revokeObjectURL(url)
	}

	/**
	 * Scrolls the window down once and gives the page time to react.
	 *
	 * Sequence: wait 400 ms, scroll the window 2000 px vertically, wait 1000 ms.
	 * The earlier version used a setInterval that was cleared on its first tick,
	 * which is the same single delayed scroll expressed indirectly.
	 *
	 * @returns {Promise<void>} Resolves after the final pause.
	 */
	async function scrollDown() {
		const scrollDistancePx = 2000
		const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms))

		await pause(400)
		window.scrollBy(0, scrollDistancePx)
		await pause(1000)
	}

	/**
	 * Returns the first e-mail-looking substring of `text`.
	 *
	 * @param {string|null|undefined} text - Text to search.
	 * @returns {string} The first match, or '' when `text` is nullish or holds
	 *   no match.
	 */
	function getEmailFromText(text) {
		const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g
		const email = text?.match(emailRegex)?.[0]
		// The logical OR had been mangled into backslash-escaped pipes, which
		// does not parse; restored here.
		return email || ''
	}

	/**
	 * Clicks the comment-opening element inside a post.
	 *
	 * For every div under `post` whose data-visualcompletion attribute equals
	 * "ignore-dynamic", walks a fixed chain of child indexes and clicks the
	 * element found at the end of it. Divs whose chain breaks off early are
	 * skipped.
	 *
	 * @param {Element} post - Post container to search.
	 */
	function clickOnComments(post) {
		// Child indexes followed from each marker div down to the click target.
		const pathToTrigger = [0, 0, 0, 0, 0, 1, 1, 0, 0]
		const stepInto = (node, childIndex) => node?.children?.[childIndex]

		for (const div of post.getElementsByTagName('div')) {
			if (div.getAttribute('data-visualcompletion') !== 'ignore-dynamic') {
				continue
			}
			const trigger = pathToTrigger.reduce(stepInto, div)
			if (trigger) {
				trigger.click()
			}
		}
	}

	/**
	 * Collects the trimmed text of every non-blank text node under `element`,
	 * in document (depth-first) order.
	 *
	 * @param {Node} element - Root of the subtree to read.
	 * @returns {string[]} Trimmed text fragments; empty when there are none.
	 */
	function traverseElements(element) {
		const children = element.childNodes

		// Leaf: only a text node with visible content contributes anything.
		if (children.length === 0) {
			if (element.nodeType !== Node.TEXT_NODE) {
				return []
			}
			const trimmed = element.nodeValue.trim()
			return trimmed === '' ? [] : [trimmed]
		}

		// Branch: gather the fragments of each child, left to right.
		const fragments = []
		for (const child of children) {
			for (const fragment of traverseElements(child)) {
				fragments.push(fragment)
			}
		}
		return fragments
	}

	function getTextFromComment(textArray) {
	return textArray
	?.filter((section) => {
	if (section === 'Reply') {
	return false
	}
	if (section?.match(/^\d+$/)) {
	return false
	}
	if (section === 'Like') {
	return false
	}
	if (section === 'Top Contributor') {
	return false
	}
	if (section === 'Follow') {
	return false
	}
	if (section === '·') {
	return false
	}
	return true
	})
	?.slice(1, textArray.length - 3)
	?.join(' ')
	}

	/**
	 * Extracts the comments found under a container.
	 *
	 * Searches `post` when given, otherwise the first `div[role=dialog]` in the
	 * document. A comment is any div whose aria-label starts with "Comment by".
	 *
	 * @param {Element} [post] - Container to search; omit to use the open dialog.
	 * @returns {Array<{id: string, author_name: string, comment: string, email: string}>}
	 *   One record per comment div; [] when there is no container.
	 */
	function extractComments(post = undefined) {
		const container = post ? post : document?.querySelector('div[role=dialog]')
		if (!container) {
			return []
		}

		// Text fragments of every comment div, in document order.
		const commentTextArrays = []
		for (const div of container.getElementsByTagName('div')) {
			const label = div.getAttribute('aria-label')
			if (label && label.startsWith('Comment by')) {
				commentTextArrays.push(traverseElements(div))
			}
		}

		return commentTextArrays.map((parts) => {
			const comment = getTextFromComment(parts)
			return {
				// All fragments joined, so identical comments share an id.
				id: parts?.join('-')?.toLowerCase(),
				// The first fragment is used as the author's name.
				author_name: parts?.[0],
				comment,
				email: getEmailFromText(comment),
			}
		})
	}

	/**
	 * Lists the feed entries that look like posts.
	 *
	 * Takes the direct div children of `div[role=feed]` and keeps those that
	 * contain an <h3> with non-empty text.
	 *
	 * @returns {Element[]} Matching feed entries, in document order.
	 */
	function getAllPosts() {
		const feedEntries = Array.from(
			document.querySelectorAll('div[role=feed] > div'),
		)
		return feedEntries.filter((entry) =>
			Boolean(entry?.querySelector('h3')?.textContent),
		)
	}

	/**
	 * Resolves after `ms` milliseconds.
	 *
	 * @param {number} ms - Delay passed to setTimeout.
	 * @returns {Promise<void>}
	 */
	const sleep = (ms) =>
		new Promise((done) => {
			setTimeout(done, ms)
		})

	/**
	 * Clicks the first `div[aria-label="Close"]` in the document, if there is one.
	 */
	function closeDialog() {
		const closeButton = document?.querySelector('div[aria-label="Close"]')
		if (closeButton) {
			closeButton.click()
		}
	}

	/**
	 * Reads the body text of a post.
	 *
	 * The text container is located with a purely positional selector. A
	 * `div[data-ad-preview="message"]` lookup was tried before and did not match
	 * every post.
	 *
	 * @param {Element|null|undefined} post - Post container.
	 * @returns {string|undefined} Text fragments joined by spaces, or undefined
	 *   when the container is not found.
	 */
	function getPostText(post) {
		const postTextSelector =
			'div > div > div > div > div > div > div > div > div > div:nth-child(8) > div > div > div:nth-child(3) > div'
		const textContainer = post?.querySelector(postTextSelector)

		if (!textContainer) {
			return undefined
		}

		const fragments = traverseElements(textContainer)
		return fragments?.join(' ')
	}

	/**
	 * Expands truncated text inside a post by clicking every
	 * `div[role="button"]` whose trimmed text is exactly "See more".
	 *
	 * @param {Element} post - Post container to search.
	 */
	function clickSeeMoreIfItsThere(post) {
		for (const button of post.querySelectorAll('div[role="button"]')) {
			const label = button.textContent.trim()
			if (label === 'See more') {
				button.click()
			}
		}
	}

	/**
	 * Tags comment records with the text of the post they belong to.
	 *
	 * Each result starts with `post` and `is_comment: true`, then takes every
	 * property of the comment, so a comment's own `post` / `is_comment` wins.
	 *
	 * @param {{post?: string}|null|undefined} post - Record of the parent post.
	 * @param {Array<Object>} comments - Comment records to tag.
	 * @returns {Array<Object>} New records; the inputs are not modified.
	 */
	function getAllCommentsAndFormat(post, comments) {
		const tagWithPost = (comment) => ({
			post: post?.post,
			is_comment: true,
			...comment,
		})
		return comments.map(tagWithPost)
	}

	/**
	 * Derives a post id from the author name and the post text.
	 *
	 * Spaces in both parts become hyphens, the parts are joined with a hyphen
	 * and the result is lower-cased. A nullish part shows up as the text
	 * "undefined".
	 *
	 * @param {string|null|undefined} posterName - Author name.
	 * @param {string|null|undefined} postText - Post body.
	 * @returns {string} The derived id.
	 */
	function getPostId(posterName, postText) {
		const hyphenate = (value) => value?.split(' ')?.join('-')
		const combined = `${hyphenate(posterName)}-${hyphenate(postText)}`
		return combined.toLowerCase()
	}

	/**
	 * Scrapes the posts currently in the feed, plus their comments, and
	 * downloads everything as 'facebookGroupPostAndComments.csv'.
	 *
	 * For each post: expands "See more", reads the post text, collects the
	 * comments visible inline, opens the comment dialog, collects the comments
	 * shown there and closes the dialog. While the outer `scrolls` counter is
	 * positive, the page is scrolled after each post (decrementing the counter)
	 * and the post list is re-read. Records are de-duplicated by `id`, keeping
	 * the first occurrence.
	 *
	 * @returns {Promise<void>} Resolves once the CSV download has been triggered.
	 */
	async function run() {
		console.log('starting...')
		const allContent = []
		let posts = getAllPosts()
		console.log('posts.length', posts.length)
		let i = 0

		// Index loop rather than for...of: `posts` is replaced after each scroll.
		while (i < posts.length) {
			const post = posts[i]
			console.log(
				`while you're waiting, why not check out https://thewebscrapingguy.com/? 😅`,
			)
			const posterName = post?.querySelector('h3')?.textContent
			console.log('posterName', posterName)
			clickSeeMoreIfItsThere(post)
			// Give the expanded text time to render before reading it.
			await sleep(1000)
			const postText = getPostText(post)
			const commentsDisplayedWithoutClicking = extractComments(post)
			clickOnComments(post)
			// Give the comment dialog time to open before reading it.
			await sleep(1000)
			const commentsAfterClickingModal = extractComments()
			closeDialog()

			// Only the first two space-separated tokens of the name are used.
			const [firstName, lastName] = posterName?.split(' ') ?? []
			const content = {
				id: getPostId(posterName, postText),
				is_post: true,
				author_name: posterName,
				first_name: firstName,
				last_name: lastName,
				post: postText,
				email: getEmailFromText(postText),
			}

			const comments = getAllCommentsAndFormat(content, [
				...commentsDisplayedWithoutClicking,
				...commentsAfterClickingModal,
			])

			allContent.push(content)
			allContent.push(...comments)
			i++
			if (scrolls > 0) {
				await scrollDown()
				scrolls--
				const currentPosts = getAllPosts()
				console.log('currentPosts', currentPosts.length)
				posts = currentPosts
			}
		}

		// Keep the first record per id. A Set gives constant-time membership
		// checks where scanning an array of ids made the pass quadratic.
		const seenIds = new Set()
		const unique = allContent.filter((content) => {
			if (seenIds.has(content.id)) {
				return false
			}
			seenIds.add(content.id)
			return true
		})

		console.log('done!')
		console.log('allContent', unique)
		createCSV(unique, 'facebookGroupPostAndComments.csv')
		console.log(
			`Congrats! 🎉 You scraped a sh*t ton of posts! If you need any custom scrapers built, email me: [email protected]`,
		)
	}

	// NOTE: to increase the number of posts, increase the "scrolls" variable below
	// run() scrolls the page once after each processed post while this counter is
	// positive and decrements it every time, so it caps the number of scrolls.
	let scrolls = 5
	// Top-level await: only valid where the host allows it (an ES module, or a
	// console that supports it).
	await run()