Last active
November 12, 2024 17:51
-
-
Save nicosh/b4ef0d2392f78a241d600806c5328952 to your computer and use it in GitHub Desktop.
Simple way to scrape linkedin company employees with puppeteer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// A simple, bare-bones example of how to scrape company employee data.
// This script saves the output to a .json file.
// The script needs your cookies to log in.
// For Educational Purposes Only :)
const puppeteer = require('puppeteer'); | |
const fs = require('fs'); | |
const cookies = require('./cookies.json'); | |
/** URL fragment used to recognise the search API responses worth capturing. */
const API_ENDPOINT = 'https://www.linkedin.com/voyager/api/search/hits?';
/**
 * Tells whether `url` points at a LinkedIn company page.
 *
 * The URL must *start* with the company prefix; a string that merely embeds
 * the prefix somewhere else (for example inside a query parameter) is rejected.
 *
 * @param {unknown} url - Candidate URL; non-string values yield `false`.
 * @returns {boolean} `true` when `url` begins with the LinkedIn company prefix.
 */
const isAgencyUrl = (url) =>
  typeof url === 'string' && url.startsWith('https://www.linkedin.com/company');
/**
 * Scrolls the given page in 600px steps on a fixed timer until a tick finds no
 * `.artdeco-loader` element in the document.
 *
 * The delay between ticks is picked once, at random, between 3000 and 6000 ms.
 *
 * @param {object} page - Puppeteer page to scroll.
 * @returns {Promise<void>} Settles when the in-page scroll loop has stopped.
 */
const scrollUntilLoaderGone = (page) =>
  page.evaluate(
    () =>
      new Promise((resolve) => {
        const distance = 600;
        const delay = Math.floor(Math.random() * (6000 - 3000 + 1)) + 3000;
        const timer = setInterval(() => {
          // Look for the loader before scrolling, so each tick reacts to the
          // state left behind by the previous scroll.
          const loaders = document.querySelectorAll('.artdeco-loader');
          window.scrollBy(0, distance);
          if (loaders.length === 0) {
            clearInterval(timer);
            resolve();
          }
        }, delay);
      }),
  );

/**
 * Opens a LinkedIn company page, follows its "people" tab, scrolls through the
 * list and writes every intercepted search API payload to `out.json`.
 *
 * Data is collected by listening to the browser's own responses whose URL
 * contains API_ENDPOINT, instead of scraping each profile.
 *
 * @param {string} url - Company page URL to start from.
 * @returns {Promise<void>} Resolves once the browser has been closed. Failures
 *   after the first navigation are logged rather than rejected; failures while
 *   opening the page or during the first navigation still reject, but the
 *   browser is closed either way.
 */
const ScrapePage = async (url) => {
  const browser = await puppeteer.launch({ headless: false });
  const result = [];
  // One promise per intercepted response, so the output is only written after
  // every captured body has been read and parsed.
  const pendingBodies = [];

  try {
    const page = await browser.newPage();
    await page.setCookie(...cookies);

    page.on('response', (response) => {
      if (!response.url().includes(API_ENDPOINT)) {
        return;
      }
      pendingBodies.push(
        response
          .text()
          .then((textBody) => {
            result.push(JSON.parse(textBody).included);
          })
          .catch((err) => {
            // A single unreadable or non-JSON body must not abort the run.
            console.error(`Could not read response from ${response.url()}:`, err);
          }),
      );
    });

    await page.goto(url);

    try {
      const peopleUrl = await page.evaluate(() => {
        const tab = document.querySelector('[data-control-name="page_member_main_nav_people_tab"]');
        return tab ? tab.href : null;
      });
      if (!peopleUrl) {
        console.error(`No people tab link found on ${url}; nothing written.`);
        return;
      }
      await page.goto(peopleUrl);

      try {
        await scrollUntilLoaderGone(page);
      } catch (err) {
        // Best effort: keep whatever was captured before scrolling broke.
        console.error('Scrolling failed; writing the results collected so far:', err);
      }

      // The browser must still be open here, otherwise bodies cannot be read.
      await Promise.all(pendingBodies);
      fs.writeFileSync('out.json', JSON.stringify(result));
    } catch (err) {
      console.error(`Scraping ${url} failed:`, err);
    }
  } finally {
    await browser.close();
  }
};
/**
 * Entry point: starts a scrape when `url` passes the `isAgencyUrl` check.
 *
 * @param {string} url - Candidate company page URL.
 * @returns {void} The scrape runs in the background; a failure is logged and
 *   marks the process exit code as 1.
 */
const init = (url) => {
  if (!isAgencyUrl(url)) {
    console.error(`Skipping "${url}": not a LinkedIn company URL.`);
    return;
  }
  // ScrapePage is async; handle its rejection here so it never goes unhandled.
  ScrapePage(url).catch((err) => {
    console.error(`Scraping ${url} failed:`, err);
    process.exitCode = 1;
  });
};
// Run the scraper against the example company URL.
init('https://www.linkedin.com/company/some-company-name');
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment