callmephil · February 26, 2025 12:40
diff --git a/linkedin_scrapper.js b/linkedin_scrapper.js
 (function () {
  /**
   * CSS selectors for the LinkedIn comment markup this script reads.
   * Frozen so no later code can overwrite or add an entry by accident.
   */
  const Selectors = Object.freeze({
    // Text read as the comment author's name.
    COMMENTER_NAME: ".comments-comment-meta__description-title",
    // Text read as the comment body.
    COMMENT_TEXT: ".comments-comment-item__main-content",
    // Text read as the reaction count; callers fall back to "0" when absent.
    LIKES_COUNT: ".comments-comment-social-bar__reactions-count--cr span.v-align-middle",
    // Reply articles, queried relative to the comment element being parsed.
    REPLY_ELEMENTS: ":scope article.comments-comment-entity--reply",
    // Comment articles that are not themselves marked as replies.
    TOP_LEVEL_COMMENTS: "article.comments-comment-entity:not(.comments-comment-entity--reply)",
    // Button clicked repeatedly until it is no longer found in the document.
    LOAD_MORE_BUTTON: ".comments-comments-list__load-more-comments-button--cr",
  });

  /**
   * Read the trimmed text of the first descendant matching `selector`.
   * @param {Element} element - Node to search within.
   * @param {string} selector - CSS selector for the wanted descendant.
   * @param {*} onEmpty - Returned unchanged when nothing matches.
   * @returns {*} The match's trimmed `innerText`, or `onEmpty`.
   */
  function parseElement(element, selector, onEmpty) {
    const match = element.querySelector(selector);
    if (!match) {
      return onEmpty;
    }
    return match.innerText.trim();
  }

  /**
   * Map every descendant matching `selector` through `onElement`.
   * @param {Element} element - Node to search within.
   * @param {string} selector - CSS selector for the wanted descendants.
   * @param {Function} onElement - Mapper handed to `Array.prototype.map`.
   * @param {*} onEmpty - Returned unchanged when there are no matches.
   * @returns {*} Array of mapped results, or `onEmpty`.
   */
  function parseElements(element, selector, onElement, onEmpty) {
    const matches = element.querySelectorAll(selector);
    if (matches.length > 0) {
      return Array.from(matches).map(onElement);
    }
    return onEmpty;
  }

  /**
   * Build the record for one comment element, recursing into its replies.
   * @param {HTMLElement} commentEl - Comment (or reply) element to read.
   * @returns {Object} `{ name, comment, likes, analysis, replies }`; the text
   *   fields come from parseElement, `replies` holds records of the same shape.
   */
  function parseComment(commentEl) {
    return {
      name: parseElement(commentEl, Selectors.COMMENTER_NAME, ""),
      comment: parseElement(commentEl, Selectors.COMMENT_TEXT, ""),
      // Kept as the displayed text; "0" when no reaction count is found.
      likes: parseElement(commentEl, Selectors.LIKES_COUNT, "0"),
      // TODO: add an api that can analyze the sentiment of the comment
      // sentiment, summary, usefulness score...
      analysis: {},
      replies: parseElements(
        commentEl,
        Selectors.REPLY_ELEMENTS,
        (replyEl) => parseComment(replyEl),
        []
      ),
    };
  }

  /**
   * Offer `content` to the user as a downloadable JSON file.
   * @param {string} filename - Name suggested for the saved file.
   * @param {string} content - Text placed in the file.
   */
  function downloadFile(filename, content) {
    const payload = new Blob([content], { type: "application/json" });
    const objectUrl = URL.createObjectURL(payload);

    const link = document.createElement("a");
    link.href = objectUrl;
    link.download = filename;

    // The anchor is only attached to the document for the duration of the click.
    const { body } = document;
    body.appendChild(link);
    link.click();
    body.removeChild(link);

    URL.revokeObjectURL(objectUrl);
  }

  /**
   * Click the "load more" button once per second until it can no longer be
   * found, then invoke `callback` exactly once.
   * @param {Function} callback - Called with no arguments after polling stops.
   */
  function loadAllComments(callback) {
    const timerId = setInterval(function pollForLoadMore() {
      const button = document.querySelector(Selectors.LOAD_MORE_BUTTON);
      if (!button) {
        // Nothing left to expand: stop polling before handing control back.
        clearInterval(timerId);
        callback();
        return;
      }
      button.click();
    }, 1000);
  }

  // Entry point: expand every comment page, then scrape, log and export.
  loadAllComments(() => {
    // Replies are skipped here; parseComment collects them under their parent.
    const topLevelArticles = Array.from(
      document.querySelectorAll(Selectors.TOP_LEVEL_COMMENTS)
    );
    const results = topLevelArticles.map((article) => parseComment(article));

    // The same pretty-printed JSON goes to the console and into the download.
    const json = JSON.stringify(results, null, 2);
    console.log(json);
    downloadFile("linkedin_post.json", json);
  });
 })();
diff --git a/threads_scrapper.js b/threads_scrapper.js
 (function () {
  // 1) CSS selectors for the Threads markup this script reads.
  // NOTE(review): these look like build-generated class names and will
  // presumably need refreshing whenever the site's markup changes - verify.
  // Frozen so no later code can overwrite or add an entry by accident.
  const Selectors = Object.freeze({
    // One element per thread; each match is handed to parseComment.
    THREAD_CONTAINER: ".x78zum5.xdt5ytf.x1iyjqo2.x1n2onr6 > .x78zum5.xdt5ytf",
    // Text read as the author's name.
    COMMENTER_NAME: ".x1lliihq.x193iq5w.x6ikm8r.x10wlt62.xlyipyv.xuxw1ft",
    // Text read as the post body.
    COMMENT_TEXT:
      ".x1lliihq.x1plvlek.xryxfnj.x1n2onr6.x1ji0vk5.x18bv5gf.x193iq5w.xeuugli.x1fj9vlw.x13faqbe.x1vvkbs.x1s928wv.xhkezso.x1gmr53x.x1cpjm7i.x1fgarty.x1943h6x.x1i0vuye.xjohtrz.xo1l8bm.xp07o12.x1yc453h.xat24cr.xdj266r",
    // Text read as the like count; callers fall back to "0" when absent.
    LIKES_COUNT: ".x17qophe.x10l6tqk.x13vifvy",
  });

  // 2) Text helper: trimmed innerText of the first descendant matching
  // `selector`, or `defaultVal` (an empty string unless given) when none does.
  function parseElement(element, selector, defaultVal = "") {
    const match = element.querySelector(selector);
    if (!match) {
      return defaultVal;
    }
    return match.innerText.trim();
  }

  // Map every descendant matching `selector` through `onElement`; when there
  // are no matches, hand back `onEmpty` unchanged.
  function parseElements(element, selector, onElement, onEmpty) {
    const matches = element.querySelectorAll(selector);
    if (matches.length > 0) {
      return Array.from(matches).map(onElement);
    }
    return onEmpty;
  }

  // 3) Build the record for one thread element, recursing into reply matches.
  // Returns { name, comment, likes, analysis, replies }.
  // NOTE(review): replies come from a descendant query, so if reply matches
  // nest inside one another a deep reply would be reported under every
  // matching ancestor - confirm against the real DOM.
  function parseComment(threadEl) {
    return {
      name: parseElement(threadEl, Selectors.COMMENTER_NAME),
      comment: parseElement(threadEl, Selectors.COMMENT_TEXT),
      // Kept as the displayed text; "0" when no like count is found.
      likes: parseElement(threadEl, Selectors.LIKES_COUNT, "0"),
      analysis: {},
      replies: parseElements(threadEl, ".x1a2a7pz.x1n2onr6", parseComment, []),
    };
  }

  /**
   * 4) Scroll to the bottom of the page on a timer until the page height stops
   * changing or the attempt budget runs out, then invoke `callback` once.
   *
   * With the defaults the loop gives up after 10 scrolls even if the page is
   * still growing, so long feeds are truncated; raise `maxAttempts` to go on.
   *
   * @param {Function} callback - Called with no arguments after scrolling stops.
   * @param {Object} [options] - Optional tuning; omit it for the defaults.
   * @param {number} [options.intervalMs=3000] - Delay between scrolls.
   * @param {number} [options.maxAttempts=10] - Hard cap on the number of scrolls.
   * @param {number} [options.stableTicks=3] - Consecutive unchanged height
   *   readings that count as "nothing more is loading".
   */
  function loadAllThreads(
    callback,
    { intervalMs = 3000, maxAttempts = 10, stableTicks = 3 } = {}
  ) {
    let lastHeight = 0;
    let stableCount = 0;
    let attempts = 0;

    const timerId = setInterval(() => {
      window.scrollTo(0, document.body.scrollHeight);
      attempts += 1;

      // Any change in height means new content arrived, so restart the count.
      const currentHeight = document.body.scrollHeight;
      if (currentHeight === lastHeight) {
        stableCount += 1;
      } else {
        stableCount = 0;
        lastHeight = currentHeight;
      }

      if (stableCount >= stableTicks || attempts >= maxAttempts) {
        clearInterval(timerId);
        callback();
      }
    }, intervalMs);
  }

  // 5) Offer `content` to the user as a JSON file named `filename`.
  function downloadFile(filename, content) {
    const payload = new Blob([content], { type: "application/json" });
    const objectUrl = URL.createObjectURL(payload);

    const link = document.createElement("a");
    link.href = objectUrl;
    link.download = filename;

    // The anchor is only attached to the document for the duration of the click.
    const { body } = document;
    body.appendChild(link);
    link.click();
    body.removeChild(link);

    URL.revokeObjectURL(objectUrl);
  }

  // 6) Entry point: scroll the feed, then scrape, log and export the result.
  loadAllThreads(() => {
    console.log("Done scrolling, now parsing...");

    const threadEls = Array.from(document.querySelectorAll(Selectors.THREAD_CONTAINER));
    const data = threadEls.map((el) => parseComment(el));
    console.log("Parsed data:", data);

    // The same pretty-printed JSON goes to the console and into the download.
    const json = JSON.stringify(data, null, 2);
    console.log(json);
    downloadFile("threads_data.json", json);
  });
 })();
	(function () {
	/**
	 * CSS selectors for the LinkedIn comment markup this script reads.
	 * Frozen so no later code can overwrite or add an entry by accident.
	 */
	const Selectors = Object.freeze({
	  // Text read as the comment author's name.
	  COMMENTER_NAME: ".comments-comment-meta__description-title",
	  // Text read as the comment body.
	  COMMENT_TEXT: ".comments-comment-item__main-content",
	  // Text read as the reaction count; callers fall back to "0" when absent.
	  LIKES_COUNT: ".comments-comment-social-bar__reactions-count--cr span.v-align-middle",
	  // Reply articles, queried relative to the comment element being parsed.
	  REPLY_ELEMENTS: ":scope article.comments-comment-entity--reply",
	  // Comment articles that are not themselves marked as replies.
	  TOP_LEVEL_COMMENTS: "article.comments-comment-entity:not(.comments-comment-entity--reply)",
	  // Button clicked repeatedly until it is no longer found in the document.
	  LOAD_MORE_BUTTON: ".comments-comments-list__load-more-comments-button--cr",
	});

	/**
	 * Read the trimmed text of the first descendant matching `selector`.
	 * @param {Element} element - Node to search within.
	 * @param {string} selector - CSS selector for the wanted descendant.
	 * @param {*} onEmpty - Returned unchanged when nothing matches.
	 * @returns {*} The match's trimmed `innerText`, or `onEmpty`.
	 */
	function parseElement(element, selector, onEmpty) {
	  const match = element.querySelector(selector);
	  if (!match) {
	    return onEmpty;
	  }
	  return match.innerText.trim();
	}

	/**
	 * Map every descendant matching `selector` through `onElement`.
	 * @param {Element} element - Node to search within.
	 * @param {string} selector - CSS selector for the wanted descendants.
	 * @param {Function} onElement - Mapper handed to `Array.prototype.map`.
	 * @param {*} onEmpty - Returned unchanged when there are no matches.
	 * @returns {*} Array of mapped results, or `onEmpty`.
	 */
	function parseElements(element, selector, onElement, onEmpty) {
	  const matches = element.querySelectorAll(selector);
	  if (matches.length > 0) {
	    return Array.from(matches).map(onElement);
	  }
	  return onEmpty;
	}

	/**
	 * Build the record for one comment element, recursing into its replies.
	 * @param {HTMLElement} commentEl - Comment (or reply) element to read.
	 * @returns {Object} `{ name, comment, likes, analysis, replies }`; the text
	 *   fields come from parseElement, `replies` holds records of the same shape.
	 */
	function parseComment(commentEl) {
	  return {
	    name: parseElement(commentEl, Selectors.COMMENTER_NAME, ""),
	    comment: parseElement(commentEl, Selectors.COMMENT_TEXT, ""),
	    // Kept as the displayed text; "0" when no reaction count is found.
	    likes: parseElement(commentEl, Selectors.LIKES_COUNT, "0"),
	    // TODO: add an api that can analyze the sentiment of the comment
	    // sentiment, summary, usefulness score...
	    analysis: {},
	    replies: parseElements(
	      commentEl,
	      Selectors.REPLY_ELEMENTS,
	      (replyEl) => parseComment(replyEl),
	      []
	    ),
	  };
	}

	/**
	 * Offer `content` to the user as a downloadable JSON file.
	 * @param {string} filename - Name suggested for the saved file.
	 * @param {string} content - Text placed in the file.
	 */
	function downloadFile(filename, content) {
	  const payload = new Blob([content], { type: "application/json" });
	  const objectUrl = URL.createObjectURL(payload);

	  const link = document.createElement("a");
	  link.href = objectUrl;
	  link.download = filename;

	  // The anchor is only attached to the document for the duration of the click.
	  const { body } = document;
	  body.appendChild(link);
	  link.click();
	  body.removeChild(link);

	  URL.revokeObjectURL(objectUrl);
	}

	/**
	 * Click the "load more" button once per second until it can no longer be
	 * found, then invoke `callback` exactly once.
	 * @param {Function} callback - Called with no arguments after polling stops.
	 */
	function loadAllComments(callback) {
	  const timerId = setInterval(function pollForLoadMore() {
	    const button = document.querySelector(Selectors.LOAD_MORE_BUTTON);
	    if (!button) {
	      // Nothing left to expand: stop polling before handing control back.
	      clearInterval(timerId);
	      callback();
	      return;
	    }
	    button.click();
	  }, 1000);
	}

	// Entry point: expand every comment page, then scrape, log and export.
	loadAllComments(() => {
	  // Replies are skipped here; parseComment collects them under their parent.
	  const topLevelArticles = Array.from(
	    document.querySelectorAll(Selectors.TOP_LEVEL_COMMENTS)
	  );
	  const results = topLevelArticles.map((article) => parseComment(article));

	  // The same pretty-printed JSON goes to the console and into the download.
	  const json = JSON.stringify(results, null, 2);
	  console.log(json);
	  downloadFile("linkedin_post.json", json);
	});
	})();
	(function () {
	// 1) CSS selectors for the Threads markup this script reads.
	// NOTE(review): these look like build-generated class names and will
	// presumably need refreshing whenever the site's markup changes - verify.
	// Frozen so no later code can overwrite or add an entry by accident.
	const Selectors = Object.freeze({
	  // One element per thread; each match is handed to parseComment.
	  THREAD_CONTAINER: ".x78zum5.xdt5ytf.x1iyjqo2.x1n2onr6 > .x78zum5.xdt5ytf",
	  // Text read as the author's name.
	  COMMENTER_NAME: ".x1lliihq.x193iq5w.x6ikm8r.x10wlt62.xlyipyv.xuxw1ft",
	  // Text read as the post body.
	  COMMENT_TEXT:
	    ".x1lliihq.x1plvlek.xryxfnj.x1n2onr6.x1ji0vk5.x18bv5gf.x193iq5w.xeuugli.x1fj9vlw.x13faqbe.x1vvkbs.x1s928wv.xhkezso.x1gmr53x.x1cpjm7i.x1fgarty.x1943h6x.x1i0vuye.xjohtrz.xo1l8bm.xp07o12.x1yc453h.xat24cr.xdj266r",
	  // Text read as the like count; callers fall back to "0" when absent.
	  LIKES_COUNT: ".x17qophe.x10l6tqk.x13vifvy",
	});

	// 2) Text helper: trimmed innerText of the first descendant matching
	// `selector`, or `defaultVal` (an empty string unless given) when none does.
	function parseElement(element, selector, defaultVal = "") {
	  const match = element.querySelector(selector);
	  if (!match) {
	    return defaultVal;
	  }
	  return match.innerText.trim();
	}

	// Map every descendant matching `selector` through `onElement`; when there
	// are no matches, hand back `onEmpty` unchanged.
	function parseElements(element, selector, onElement, onEmpty) {
	  const matches = element.querySelectorAll(selector);
	  if (matches.length > 0) {
	    return Array.from(matches).map(onElement);
	  }
	  return onEmpty;
	}

	// 3) Build the record for one thread element, recursing into reply matches.
	// Returns { name, comment, likes, analysis, replies }.
	// NOTE(review): replies come from a descendant query, so if reply matches
	// nest inside one another a deep reply would be reported under every
	// matching ancestor - confirm against the real DOM.
	function parseComment(threadEl) {
	  return {
	    name: parseElement(threadEl, Selectors.COMMENTER_NAME),
	    comment: parseElement(threadEl, Selectors.COMMENT_TEXT),
	    // Kept as the displayed text; "0" when no like count is found.
	    likes: parseElement(threadEl, Selectors.LIKES_COUNT, "0"),
	    analysis: {},
	    replies: parseElements(threadEl, ".x1a2a7pz.x1n2onr6", parseComment, []),
	  };
	}

	/**
	 * 4) Scroll to the bottom of the page on a timer until the page height stops
	 * changing or the attempt budget runs out, then invoke `callback` once.
	 *
	 * With the defaults the loop gives up after 10 scrolls even if the page is
	 * still growing, so long feeds are truncated; raise `maxAttempts` to go on.
	 *
	 * @param {Function} callback - Called with no arguments after scrolling stops.
	 * @param {Object} [options] - Optional tuning; omit it for the defaults.
	 * @param {number} [options.intervalMs=3000] - Delay between scrolls.
	 * @param {number} [options.maxAttempts=10] - Hard cap on the number of scrolls.
	 * @param {number} [options.stableTicks=3] - Consecutive unchanged height
	 *   readings that count as "nothing more is loading".
	 */
	function loadAllThreads(
	  callback,
	  { intervalMs = 3000, maxAttempts = 10, stableTicks = 3 } = {}
	) {
	  let lastHeight = 0;
	  let stableCount = 0;
	  let attempts = 0;

	  const timerId = setInterval(() => {
	    window.scrollTo(0, document.body.scrollHeight);
	    attempts += 1;

	    // Any change in height means new content arrived, so restart the count.
	    const currentHeight = document.body.scrollHeight;
	    if (currentHeight === lastHeight) {
	      stableCount += 1;
	    } else {
	      stableCount = 0;
	      lastHeight = currentHeight;
	    }

	    // Stop on either condition: the page settled, or the budget is spent.
	    if (stableCount >= stableTicks || attempts >= maxAttempts) {
	      clearInterval(timerId);
	      callback();
	    }
	  }, intervalMs);
	}

	// 5) Offer `content` to the user as a JSON file named `filename`.
	function downloadFile(filename, content) {
	  const payload = new Blob([content], { type: "application/json" });
	  const objectUrl = URL.createObjectURL(payload);

	  const link = document.createElement("a");
	  link.href = objectUrl;
	  link.download = filename;

	  // The anchor is only attached to the document for the duration of the click.
	  const { body } = document;
	  body.appendChild(link);
	  link.click();
	  body.removeChild(link);

	  URL.revokeObjectURL(objectUrl);
	}

	// 6) Entry point: scroll the feed, then scrape, log and export the result.
	loadAllThreads(() => {
	  console.log("Done scrolling, now parsing...");

	  const threadEls = Array.from(document.querySelectorAll(Selectors.THREAD_CONTAINER));
	  const data = threadEls.map((el) => parseComment(el));
	  console.log("Parsed data:", data);

	  // The same pretty-printed JSON goes to the console and into the download.
	  const json = JSON.stringify(data, null, 2);
	  console.log(json);
	  downloadFile("threads_data.json", json);
	});
	})();