Skip to content

Instantly share code, notes, and snippets.

@callmephil
Created February 26, 2025 12:40
Show Gist options
  • Save callmephil/07f4bed15335e72d4de723f79376d0b7 to your computer and use it in GitHub Desktop.
Save callmephil/07f4bed15335e72d4de723f79376d0b7 to your computer and use it in GitHub Desktop.
LinkedIn and Threads scraper
(function () {
// CSS selectors for the LinkedIn comment markup, keyed by the role each one
// plays in this script. NOTE(review): these mirror LinkedIn's class names at
// the time of writing — re-check them against the live page if nothing matches.
const Selectors = {
  // Queried inside a comment article by parseComment for the `name` field.
  COMMENTER_NAME: ".comments-comment-meta__description-title",

  // Queried inside a comment article by parseComment for the `comment` field.
  COMMENT_TEXT: ".comments-comment-item__main-content",

  // Queried inside a comment article by parseComment for the `likes` field;
  // parseComment falls back to "0" when nothing matches.
  LIKES_COUNT: ".comments-comment-social-bar__reactions-count--cr span.v-align-middle",

  // Reply articles located below the comment article being parsed.
  REPLY_ELEMENTS: ":scope article.comments-comment-entity--reply",

  // Comment articles that do not carry the reply modifier class.
  TOP_LEVEL_COMMENTS: "article.comments-comment-entity:not(.comments-comment-entity--reply)",

  // Button that loadAllComments clicks until it is no longer in the document.
  LOAD_MORE_BUTTON: ".comments-comments-list__load-more-comments-button--cr",
};
/**
 * Read the trimmed text of the first descendant matching a selector.
 * @param {Element} element - Node to search within.
 * @param {string} selector - CSS selector handed to `querySelector`.
 * @param {*} onEmpty - Value returned as-is when nothing matches.
 * @returns {*} The match's trimmed `innerText`, or `onEmpty`.
 */
function parseElement(element, selector, onEmpty) {
  const match = element.querySelector(selector);
  if (!match) {
    return onEmpty;
  }
  return match.innerText.trim();
}
/**
 * Map every descendant matching a selector through a callback.
 * @param {Element} element - Node to search within.
 * @param {string} selector - CSS selector handed to `querySelectorAll`.
 * @param {Function} onElement - Mapper passed straight to `Array.prototype.map`.
 * @param {*} onEmpty - Value returned as-is when nothing matches.
 * @returns {*} The array of mapped matches, or `onEmpty` when there are none.
 */
function parseElements(element, selector, onElement, onEmpty) {
  const matches = element.querySelectorAll(selector);
  if (matches.length === 0) {
    return onEmpty;
  }
  return Array.from(matches).map(onElement);
}
/**
 * Recursively parse a comment (and any nested replies).
 *
 * Reply articles are looked up as descendants of `commentEl`, so a plain
 * `querySelector` on `commentEl` can also land inside one of them. Two
 * consequences are guarded against here:
 *  - a field the comment itself lacks (e.g. no reactions yet) must fall back
 *    to its default instead of borrowing the first reply's value;
 *  - a reply nested inside another reply must appear only under its own
 *    parent, not once per ancestor.
 *
 * @param {HTMLElement} commentEl - Comment (or reply) article to parse.
 * @returns {Object} `{ name, comment, likes, analysis, replies }`, where
 *   `replies` is an array of objects of the same shape.
 */
function parseComment(commentEl) {
  // Every reply article below this comment, at any depth, in document order.
  const nestedReplyEls = Array.from(
    commentEl.querySelectorAll(Selectors.REPLY_ELEMENTS)
  );

  // Only top-most replies are parsed here; deeper ones are reached by the
  // recursive call on the reply that contains them.
  const directReplyEls = nestedReplyEls.filter(
    (replyEl) =>
      !nestedReplyEls.some((other) => other !== replyEl && other.contains(replyEl))
  );

  // Text of the first match that belongs to this comment, not to a reply.
  const ownText = (selector, onEmpty) => {
    const ownEl = Array.from(commentEl.querySelectorAll(selector)).find(
      (candidate) => !nestedReplyEls.some((replyEl) => replyEl.contains(candidate))
    );
    return ownEl ? ownEl.innerText.trim() : onEmpty;
  };

  return {
    name: ownText(Selectors.COMMENTER_NAME, ""),
    comment: ownText(Selectors.COMMENT_TEXT, ""),
    likes: ownText(Selectors.LIKES_COUNT, "0"),
    // TODO: add an api that can analyze the sentiment of the comment
    // sentiment, summary, usefulness score...
    analysis: {},
    replies: directReplyEls.map((replyEl) => parseComment(replyEl)),
  };
}
/**
 * Generate a file and trigger a download.
 *
 * The content is wrapped in an "application/json" Blob and offered through a
 * temporary `<a download>` element that is clicked programmatically.
 *
 * @param {string} filename - Name suggested to the browser for the saved file.
 * @param {string} content - Text to place in the file.
 */
function downloadFile(filename, content) {
  // Revoking in the same task as click() can cancel the download in browsers
  // that start it asynchronously, so the URL is released a little later.
  const REVOKE_DELAY_MS = 1000;

  const blob = new Blob([content], { type: "application/json" });
  const url = URL.createObjectURL(blob);
  const link = document.createElement("a");
  link.href = url;
  link.download = filename;
  document.body.appendChild(link);
  try {
    link.click();
  } finally {
    // Runs even if click() throws, so neither the anchor nor the blob URL leaks.
    document.body.removeChild(link);
    setTimeout(() => URL.revokeObjectURL(url), REVOKE_DELAY_MS);
  }
}
/**
 * Load all comments by clicking the "View More" button until it disappears.
 *
 * The document is polled on a timer. While the button is present it is
 * clicked; on the first poll where it is missing the timer is stopped and
 * `callback` is invoked. A click budget keeps the loop from running forever
 * if the button never goes away.
 *
 * @param {Function} callback - Invoked once, with no arguments, when polling stops.
 * @param {Object} [options]
 * @param {number} [options.intervalMs=1000] - Delay between polls, in milliseconds.
 * @param {number} [options.maxClicks=500] - Maximum number of clicks before giving up.
 */
function loadAllComments(callback, { intervalMs = 1000, maxClicks = 500 } = {}) {
  let clicks = 0;

  const timer = setInterval(() => {
    const loadMoreButton = document.querySelector(Selectors.LOAD_MORE_BUTTON);

    if (loadMoreButton && clicks < maxClicks) {
      clicks += 1;
      loadMoreButton.click();
      return;
    }

    clearInterval(timer);
    if (loadMoreButton) {
      // Button still present: the budget ran out, so the result may be partial.
      console.warn(
        `loadAllComments: stopped after ${maxClicks} clicks; some comments may not be loaded.`
      );
    }
    callback();
  }, intervalMs);
}
// Entry point: expand the full comment list, then scrape and export it.
loadAllComments(() => {
  // Start from top-level comments only; parseComment collects the replies
  // underneath each of them.
  const topLevelArticles = Array.from(
    document.querySelectorAll(Selectors.TOP_LEVEL_COMMENTS)
  );
  const results = topLevelArticles.map((article) => parseComment(article));

  // Serialise once; the same text is printed and offered as a download.
  const json = JSON.stringify(results, null, 2);
  console.log(json);
  downloadFile("linkedin_post.json", json);
});
})();
(function () {
// 1) CSS selectors for the Threads markup, keyed by the role each one plays in
// this script. NOTE(review): the class names look machine-generated and may
// change between site builds — confirm them against the live page before use.
const Selectors = {
  // One element per thread; each match is handed to parseComment.
  THREAD_CONTAINER: ".x78zum5.xdt5ytf.x1iyjqo2.x1n2onr6 > .x78zum5.xdt5ytf",

  // Queried inside a thread element by parseComment for the `name` field.
  COMMENTER_NAME: ".x1lliihq.x193iq5w.x6ikm8r.x10wlt62.xlyipyv.xuxw1ft",

  // Queried inside a thread element by parseComment for the `comment` field.
  COMMENT_TEXT: ".x1lliihq.x1plvlek.xryxfnj.x1n2onr6.x1ji0vk5.x18bv5gf.x193iq5w.xeuugli.x1fj9vlw.x13faqbe.x1vvkbs.x1s928wv.xhkezso.x1gmr53x.x1cpjm7i.x1fgarty.x1943h6x.x1i0vuye.xjohtrz.xo1l8bm.xp07o12.x1yc453h.xat24cr.xdj266r",

  // Queried inside a thread element by parseComment for the `likes` field.
  LIKES_COUNT: ".x17qophe.x10l6tqk.x13vifvy",
};
// 2) Text lookup helper: trimmed innerText of the first descendant matching
// `selector`, or `defaultVal` (an empty string unless given) when none matches.
function parseElement(element, selector, defaultVal = "") {
  const match = element.querySelector(selector);
  if (!match) {
    return defaultVal;
  }
  return match.innerText.trim();
}
// Collection helper: maps every descendant matching `selector` through
// `onElement` (passed straight to Array.prototype.map) and returns the
// resulting array, or `onEmpty` untouched when nothing matches.
function parseElements(element, selector, onElement, onEmpty) {
  const matches = element.querySelectorAll(selector);
  if (matches.length === 0) {
    return onEmpty;
  }
  return Array.from(matches).map(onElement);
}
// 3) Turn one thread element into a plain record:
//    { name, comment, likes, analysis, replies }.
// Elements matching the reply selector are parsed with this same function and
// stored under `replies`.
// NOTE(review): both the field lookups and the reply lookup search descendants
// at any depth, so with nested matches a field may come from a reply and a
// reply may show up at more than one level — confirm against the live DOM.
function parseComment(threadEl) {
  const REPLY_SELECTOR = ".x1a2a7pz.x1n2onr6";

  return {
    name: parseElement(threadEl, Selectors.COMMENTER_NAME),
    comment: parseElement(threadEl, Selectors.COMMENT_TEXT),
    likes: parseElement(threadEl, Selectors.LIKES_COUNT, "0"),
    analysis: {},
    replies: parseElements(threadEl, REPLY_SELECTOR, parseComment, []),
  };
}
// 4) Scroll to the bottom every 3 seconds, then invoke `callback` once either
// the page height has stayed the same for more than two consecutive checks or
// the budget of 10 attempts is used up.
function loadAllThreads(callback) {
  let previousHeight = 0;
  let stableChecks = 0;
  let remainingAttempts = 10;

  const timer = setInterval(() => {
    window.scrollTo(0, document.body.scrollHeight);

    const heightNow = document.body.scrollHeight;
    if (heightNow !== previousHeight) {
      stableChecks = 0;
      previousHeight = heightNow;
    } else {
      stableChecks += 1;
    }

    // An attempt is only spent when the height check alone did not end the run.
    let finished = stableChecks > 2;
    if (!finished) {
      remainingAttempts -= 1;
      finished = remainingAttempts <= 0;
    }
    if (!finished) {
      return;
    }

    clearInterval(timer);
    callback();
  }, 3000);
}
// 5) Download JSON: wrap `content` in an "application/json" Blob and offer it
// as `filename` through a temporary <a download> element that is clicked
// programmatically.
function downloadFile(filename, content) {
  // Revoking in the same task as click() can cancel the download in browsers
  // that start it asynchronously, so the URL is released a little later.
  const REVOKE_DELAY_MS = 1000;

  const blob = new Blob([content], { type: "application/json" });
  const url = URL.createObjectURL(blob);
  const link = document.createElement("a");
  link.href = url;
  link.download = filename;
  document.body.appendChild(link);
  try {
    link.click();
  } finally {
    // Runs even if click() throws, so neither the anchor nor the blob URL leaks.
    document.body.removeChild(link);
    setTimeout(() => URL.revokeObjectURL(url), REVOKE_DELAY_MS);
  }
}
// 6) Entry point: scroll until loading stops, then parse every thread
// container, log the result, and offer it as a JSON download.
loadAllThreads(() => {
  console.log("Done scrolling, now parsing...");

  const threadEls = Array.from(document.querySelectorAll(Selectors.THREAD_CONTAINER));
  const data = threadEls.map((el) => parseComment(el));
  console.log("Parsed data:", data);

  // Serialise once; the same text is printed and offered as a download.
  const json = JSON.stringify(data, null, 2);
  console.log(json);
  downloadFile("threads_data.json", json);
});
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment