Created
February 26, 2025 12:40
-
-
Save callmephil/07f4bed15335e72d4de723f79376d0b7 to your computer and use it in GitHub Desktop.
LinkedIn and Threads scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(function () { | |
/**
 * CSS selectors used to locate the pieces of a LinkedIn comment.
 * The table is frozen so a selector cannot be reassigned by accident.
 */
const Selectors = Object.freeze({
  // Text nodes read for each comment (author, body, reaction count).
  COMMENTER_NAME: ".comments-comment-meta__description-title",
  COMMENT_TEXT: ".comments-comment-item__main-content",
  LIKES_COUNT: ".comments-comment-social-bar__reactions-count--cr span.v-align-middle",
  // `:scope` anchors the match to the element the query is run on, so this
  // finds reply articles anywhere below that element.
  REPLY_ELEMENTS: ":scope article.comments-comment-entity--reply",
  // Comment articles that do not carry the reply modifier class.
  TOP_LEVEL_COMMENTS: "article.comments-comment-entity:not(.comments-comment-entity--reply)",
  LOAD_MORE_BUTTON: ".comments-comments-list__load-more-comments-button--cr",
});
/**
 * Return the trimmed text of the first descendant of `element` that matches
 * `selector`.
 * @param {Element} element - Node whose descendants are searched.
 * @param {string} selector - CSS selector passed to `querySelector`.
 * @param {*} [onEmpty=""] - Value returned when nothing matches or the match
 *   exposes no text.
 * @returns {string|*} Trimmed text, or `onEmpty`.
 */
function parseElement(element, selector, onEmpty = "") {
  const el = element.querySelector(selector);
  if (!el) {
    return onEmpty;
  }
  // `innerText` only exists on HTML elements; use `textContent` for anything
  // else so a non-HTML match does not throw on `.trim()`.
  const text = typeof el.innerText === "string" ? el.innerText : el.textContent;
  return typeof text === "string" ? text.trim() : onEmpty;
}
/**
 * Map every descendant of `element` that matches `selector` through
 * `onElement`.
 * @param {Element} element - Node whose descendants are searched.
 * @param {string} selector - CSS selector passed to `querySelectorAll`.
 * @param {Function} onElement - Mapper; it is handed to `Array.prototype.map`
 *   and therefore receives `(el, index, array)`.
 * @param {*} [onEmpty=[]] - Returned unchanged when nothing matches.
 * @returns {Array|*} The mapped matches, or `onEmpty`.
 */
function parseElements(element, selector, onElement, onEmpty = []) {
  const matches = element.querySelectorAll(selector);
  if (matches.length === 0) {
    return onEmpty;
  }
  return Array.from(matches).map(onElement);
}
/**
 * Read one field of a comment, skipping matches that sit inside a reply.
 * @param {HTMLElement} commentEl - The comment being parsed.
 * @param {string} selector - CSS selector for the field.
 * @param {HTMLElement[]} replyEls - Reply elements found below `commentEl`.
 * @param {string} onEmpty - Value returned when the comment has no such field.
 * @returns {string} Trimmed text of the comment's own field, or `onEmpty`.
 */
function parseOwnElement(commentEl, selector, replyEls, onEmpty) {
  const ownEl = Array.from(commentEl.querySelectorAll(selector)).find(
    (candidate) => !replyEls.some((replyEl) => replyEl.contains(candidate))
  );
  return ownEl ? ownEl.innerText.trim() : onEmpty;
}

/**
 * Recursively parse a comment (and any nested replies).
 *
 * The reply selector matches replies at every depth below `commentEl`, so two
 * corrections are applied before recursing:
 *  - only replies that are not inside another matched reply are treated as
 *    replies of this comment (deeper ones are picked up by the recursion
 *    instead of being listed twice);
 *  - name, text and like count are taken from elements outside every reply,
 *    so a comment without reactions does not inherit a reply's count.
 * @param {HTMLElement} commentEl
 * @returns {Object} `{ name, comment, likes, analysis, replies }`, where
 *   `replies` holds objects of the same shape.
 */
function parseComment(commentEl) {
  const allReplyEls = parseElements(
    commentEl,
    Selectors.REPLY_ELEMENTS,
    (replyEl) => replyEl,
    []
  );
  const directReplyEls = allReplyEls.filter(
    (replyEl) => !allReplyEls.some((other) => other !== replyEl && other.contains(replyEl))
  );

  const name = parseOwnElement(commentEl, Selectors.COMMENTER_NAME, directReplyEls, "");
  const comment = parseOwnElement(commentEl, Selectors.COMMENT_TEXT, directReplyEls, "");
  const likes = parseOwnElement(commentEl, Selectors.LIKES_COUNT, directReplyEls, "0");
  const replies = directReplyEls.map((replyEl) => parseComment(replyEl));

  return {
    name,
    comment,
    likes,
    // TODO: add an api that can analyze the sentiment of the comment
    // sentiment, summary, usefulness score...
    analysis: {},
    replies,
  };
}
/**
 * Generate a file and trigger a download.
 *
 * The content is wrapped in a Blob, exposed through a temporary object URL,
 * and "clicked" via a throwaway anchor element. The anchor is removed and the
 * URL revoked even when the click throws.
 * @param {string} filename - Name suggested for the downloaded file.
 * @param {string} content - File contents.
 * @param {string} [mimeType="application/json"] - MIME type given to the Blob.
 */
function downloadFile(filename, content, mimeType = "application/json") {
  const blob = new Blob([content], { type: mimeType });
  const url = URL.createObjectURL(blob);
  try {
    const a = document.createElement("a");
    a.href = url;
    a.download = filename;
    document.body.appendChild(a);
    try {
      a.click();
    } finally {
      document.body.removeChild(a);
    }
  } finally {
    URL.revokeObjectURL(url);
  }
}
/**
 * Load all comments by clicking the "View More" button until it disappears.
 *
 * The page is polled on a timer: while the button is found it is clicked;
 * once it is gone (or the click cap is reached) polling stops and `callback`
 * runs exactly once.
 * @param {Function} callback - Invoked with no arguments when loading ends.
 * @param {Object} [options]
 * @param {number} [options.intervalMs=1000] - Delay between polls.
 * @param {number} [options.maxAttempts=Infinity] - Maximum number of clicks.
 *   The default keeps polling for as long as the button exists.
 */
function loadAllComments(callback, options = {}) {
  const { intervalMs = 1000, maxAttempts = Infinity } = options;
  let attempts = 0;
  const interval = setInterval(() => {
    const loadMoreButton = document.querySelector(Selectors.LOAD_MORE_BUTTON);
    if (loadMoreButton && attempts < maxAttempts) {
      attempts += 1;
      loadMoreButton.click();
      return;
    }
    clearInterval(interval);
    callback();
  }, intervalMs);
}
// Load all comments, then parse them and export the result.
loadAllComments(() => {
  // Select only top-level comments (exclude those marked as reply).
  const topLevelComments = document.querySelectorAll(Selectors.TOP_LEVEL_COMMENTS);
  // Parse each top-level comment (recursively gathering replies).
  const results = Array.from(topLevelComments, (article) => parseComment(article));
  // Serialize once; the same text is both logged and downloaded.
  const json = JSON.stringify(results, null, 2);
  console.log(json);
  downloadFile("linkedin_post.json", json);
});
})(); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(function () { | |
// 1) Define your custom selectors based on the snippet
/**
 * CSS selectors used to locate the pieces of a thread on the page.
 * NOTE(review): these look like generated class names and may change between
 * page builds - re-check them against the live DOM before relying on them.
 * The table is frozen so a selector cannot be reassigned by accident.
 */
const Selectors = Object.freeze({
  THREAD_CONTAINER: ".x78zum5.xdt5ytf.x1iyjqo2.x1n2onr6 > .x78zum5.xdt5ytf",
  COMMENTER_NAME: ".x1lliihq.x193iq5w.x6ikm8r.x10wlt62.xlyipyv.xuxw1ft",
  COMMENT_TEXT:
    ".x1lliihq.x1plvlek.xryxfnj.x1n2onr6.x1ji0vk5.x18bv5gf.x193iq5w.xeuugli.x1fj9vlw.x13faqbe.x1vvkbs.x1s928wv.xhkezso.x1gmr53x.x1cpjm7i.x1fgarty.x1943h6x.x1i0vuye.xjohtrz.xo1l8bm.xp07o12.x1yc453h.xat24cr.xdj266r",
  LIKES_COUNT: ".x17qophe.x10l6tqk.x13vifvy",
});
// 2) Helper function to safely fetch text
/**
 * Return the trimmed text of the first descendant of `element` that matches
 * `selector`.
 * @param {Element} element - Node whose descendants are searched.
 * @param {string} selector - CSS selector passed to `querySelector`.
 * @param {*} [defaultVal=""] - Value returned when nothing matches or the
 *   match exposes no text.
 * @returns {string|*} Trimmed text, or `defaultVal`.
 */
function parseElement(element, selector, defaultVal = "") {
  const el = element.querySelector(selector);
  if (!el) {
    return defaultVal;
  }
  // `innerText` only exists on HTML elements; use `textContent` for anything
  // else so a non-HTML match does not throw on `.trim()`.
  const text = typeof el.innerText === "string" ? el.innerText : el.textContent;
  return typeof text === "string" ? text.trim() : defaultVal;
}
/**
 * Map every descendant of `element` that matches `selector` through
 * `onElement`.
 * @param {Element} element - Node whose descendants are searched.
 * @param {string} selector - CSS selector passed to `querySelectorAll`.
 * @param {Function} onElement - Mapper; it is handed to `Array.prototype.map`
 *   and therefore receives `(el, index, array)`.
 * @param {*} [onEmpty=[]] - Returned unchanged when nothing matches.
 * @returns {Array|*} The mapped matches, or `onEmpty`.
 */
function parseElements(element, selector, onElement, onEmpty = []) {
  const matches = element.querySelectorAll(selector);
  if (matches.length === 0) {
    return onEmpty;
  }
  return Array.from(matches).map(onElement);
}
// 3) Parse a single top-level "comment" (thread)
/**
 * Build a `{ name, comment, likes, analysis, replies }` record for a thread
 * element and, recursively, for the nested elements treated as its replies.
 *
 * The reply selector matches at every depth below `threadEl`. Only matches
 * that are not inside another match are parsed at this level; deeper ones are
 * reached through the recursion, so no element is reported twice.
 * NOTE(review): whether `.x1a2a7pz.x1n2onr6` really marks replies is not
 * visible from this code - confirm against the live DOM.
 * @param {HTMLElement} threadEl
 * @returns {Object} Parsed record; `replies` holds records of the same shape.
 */
function parseComment(threadEl) {
  const name = parseElement(threadEl, Selectors.COMMENTER_NAME);
  const comment = parseElement(threadEl, Selectors.COMMENT_TEXT);
  const likes = parseElement(threadEl, Selectors.LIKES_COUNT, "0");

  const candidates = parseElements(threadEl, ".x1a2a7pz.x1n2onr6", (el) => el, []);
  const directReplyEls = candidates.filter(
    (el) => !candidates.some((other) => other !== el && other.contains(el))
  );
  // Wrap the recursive call so `map`'s index/array arguments are not forwarded.
  const replies = directReplyEls.map((replyEl) => parseComment(replyEl));

  return {
    name,
    comment,
    likes,
    analysis: {},
    replies,
  };
}
// 4) Infinite scroll until no more content loads
/**
 * Repeatedly scroll to the bottom of the page, then invoke `callback` once.
 *
 * On every tick the window is scrolled to the current document height and the
 * height is read again. Scrolling stops when the height has been unchanged
 * for `stableTicks` consecutive ticks, or after `maxAttempts` ticks in total,
 * whichever comes first.
 * @param {Function} callback - Invoked with no arguments when scrolling ends.
 * @param {Object} [options]
 * @param {number} [options.intervalMs=3000] - Delay between ticks.
 * @param {number} [options.maxAttempts=10] - Upper bound on ticks.
 * @param {number} [options.stableTicks=3] - Consecutive unchanged-height ticks
 *   that count as "no more content".
 */
function loadAllThreads(callback, options = {}) {
  const { intervalMs = 3000, maxAttempts = 10, stableTicks = 3 } = options;
  let lastHeight = 0;
  let sameHeightCount = 0;
  let attempts = 0;
  const interval = setInterval(() => {
    window.scrollTo(0, document.body.scrollHeight);
    const currentHeight = document.body.scrollHeight;
    if (currentHeight === lastHeight) {
      sameHeightCount += 1;
    } else {
      sameHeightCount = 0;
      lastHeight = currentHeight;
    }
    attempts += 1;
    if (sameHeightCount >= stableTicks || attempts >= maxAttempts) {
      clearInterval(interval);
      callback();
    }
  }, intervalMs);
}
// 5) Download JSON
/**
 * Generate a file and trigger a download.
 *
 * The content is wrapped in a Blob, exposed through a temporary object URL,
 * and "clicked" via a throwaway anchor element. The anchor is removed and the
 * URL revoked even when the click throws.
 * @param {string} filename - Name suggested for the downloaded file.
 * @param {string} content - File contents.
 * @param {string} [mimeType="application/json"] - MIME type given to the Blob.
 */
function downloadFile(filename, content, mimeType = "application/json") {
  const blob = new Blob([content], { type: mimeType });
  const url = URL.createObjectURL(blob);
  try {
    const a = document.createElement("a");
    a.href = url;
    a.download = filename;
    document.body.appendChild(a);
    try {
      a.click();
    } finally {
      document.body.removeChild(a);
    }
  } finally {
    URL.revokeObjectURL(url);
  }
}
// 6) Orchestrate: scroll, parse every thread container, then log and download.
loadAllThreads(() => {
  console.log("Done scrolling, now parsing...");
  const containers = document.querySelectorAll(Selectors.THREAD_CONTAINER);
  const data = Array.from(containers, (el) => parseComment(el));
  console.log("Parsed data:", data);
  // Serialize once; the same text is both logged and downloaded.
  const json = JSON.stringify(data, null, 2);
  console.log(json);
  // optional: trigger download
  downloadFile("threads_data.json", json);
});
})(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment