Skip to content

Instantly share code, notes, and snippets.

@nicosh
Last active November 12, 2024 17:51
Show Gist options
  • Save nicosh/b4ef0d2392f78a241d600806c5328952 to your computer and use it in GitHub Desktop.
Save nicosh/b4ef0d2392f78a241d600806c5328952 to your computer and use it in GitHub Desktop.
Simple way to scrape linkedin company employees with puppeteer
// A simple and raw example on how to scrape company employees data.
// this script will save the output in a .json file.
// the script needs your cookies to login.
// For Educational Purposes Only :)
const puppeteer = require('puppeteer');
const fs = require('fs');
const cookies = require('./cookies.json');
// URL fragment used by ScrapePage to recognise the API responses whose payload it collects.
const API_ENDPOINT = "https://www.linkedin.com/voyager/api/search/hits?"
/**
 * Tell whether `url` points at a LinkedIn company page.
 *
 * The prefix must be at the start of the string: a URL that merely carries
 * the company prefix somewhere inside it (e.g. in a query parameter) is
 * rejected, and so is anything that is not a string.
 *
 * @param {*} url - Candidate URL.
 * @returns {boolean} True when `url` starts with the LinkedIn company prefix.
 */
const isAgencyUrl = (url) =>
  typeof url === 'string' && url.startsWith("https://www.linkedin.com/company");
/**
 * Scroll the current document until its end has been reached and no
 * `.artdeco-loader` element is left in the DOM.
 *
 * This function is handed to `page.evaluate`, so it runs inside the browser
 * page and must not reference anything from the Node.js scope.
 *
 * @param {{distance: number, minDelay: number, maxDelay: number}} options
 *   Pixels scrolled per step and the bounds (ms) of the random pause between steps.
 * @returns {Promise<void>} Resolves when scrolling is finished.
 */
const scrollToEndOfResults = ({ distance, minDelay, maxDelay }) =>
  new Promise((resolve) => {
    let totalHeight = 0;
    // A fresh random pause before every step, rather than one fixed interval.
    const nextDelay = () => Math.floor(Math.random() * (maxDelay - minDelay + 1)) + minDelay;
    const step = () => {
      const scrollHeight = document.body.scrollHeight;
      const loaderPresent = document.querySelectorAll(".artdeco-loader").length > 0;
      window.scrollBy(0, distance);
      totalHeight += distance;
      // Stop only once nothing is loading AND the whole document has been
      // scrolled; a missing loader alone does not mean the list is complete.
      if (!loaderPresent && totalHeight >= scrollHeight) {
        resolve();
        return;
      }
      setTimeout(step, nextDelay());
    };
    setTimeout(step, nextDelay());
  });

/**
 * Scrape the employees shown on a LinkedIn company page.
 *
 * Launches a visible browser, installs the cookies from ./cookies.json,
 * opens `url`, follows its "People" tab and scrolls the page. Instead of
 * scraping each profile, the `included` field of every response whose URL
 * contains API_ENDPOINT is collected; the collected array is written to
 * out.json as JSON.
 *
 * Failures while opening the initial page propagate to the caller. Failures
 * after that point are logged with console.error and do not reject. The
 * browser is closed exactly once in every case.
 *
 * @param {string} url - URL of the company page to start from.
 * @returns {Promise<void>} Resolves once the browser has been closed.
 */
const ScrapePage = async (url) => {
  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage();
    await page.setCookie(...cookies);

    const result = [];
    // Body reads are asynchronous; keep them so they can be awaited before
    // the output file is written and the browser goes away.
    const pendingBodies = [];
    page.on('response', (response) => {
      const requestUrl = response.url();
      if (!requestUrl.includes(API_ENDPOINT)) {
        return;
      }
      pendingBodies.push(
        response
          .text()
          .then((textBody) => {
            result.push(JSON.parse(textBody).included);
          })
          .catch((err) => {
            // A body that is unavailable or not JSON must not abort the whole run.
            console.error(`Skipping unreadable response from ${requestUrl}: ${err.message}`);
          })
      );
    });

    await page.goto(url);

    try {
      const peopleUrl = await page.evaluate(() => {
        const tab = document.querySelector('[data-control-name="page_member_main_nav_people_tab"]');
        return tab ? tab.href : null;
      });
      if (!peopleUrl) {
        console.error(`No "People" tab link found on ${url}; nothing was scraped.`);
        return;
      }
      await page.goto(peopleUrl);

      try {
        await page.evaluate(scrollToEndOfResults, { distance: 600, minDelay: 3000, maxDelay: 6000 });
      } catch (err) {
        // Best effort: whatever was intercepted so far is still saved below.
        console.error('Scrolling stopped early:', err);
      }

      await Promise.all(pendingBodies);
      fs.writeFileSync('out.json', JSON.stringify(result));
    } catch (err) {
      console.error(`Scraping ${url} failed:`, err);
    }
  } finally {
    await browser.close();
  }
};
/**
 * Entry point: scrape `url` when it is a LinkedIn company URL.
 *
 * Any other URL is reported and skipped. Errors raised while scraping are
 * logged instead of being left as an unhandled promise rejection.
 *
 * @param {string} url - URL to validate and scrape.
 * @returns {Promise<void>} Resolves when the scrape has finished, failed, or been skipped.
 */
const init = async (url) => {
  try {
    if (!isAgencyUrl(url)) {
      console.error(`Not a LinkedIn company URL, skipping: ${url}`);
      return;
    }
    await ScrapePage(url);
  } catch (err) {
    console.error(`Scraping ${url} failed:`, err);
  }
};
// Start the scrape when the script is run; "some-company-name" looks like a placeholder for a real company slug.
init("https://www.linkedin.com/company/some-company-name")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment