-
-
Save Zonalds/92002cfbdf414ee884786d73bc33eda7 to your computer and use it in GitHub Desktop.
Simple way to scrape linkedin company employees with puppeteer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// A simple, raw example of how to scrape a company's employee data.
// This script saves the output to a .json file.
// The script needs your cookies to log in.
// For educational purposes only :)
// Third-party browser automation, Node's file system module, and the
// session cookies exported to ./cookies.json (loaded as parsed JSON).
const puppeteer = require('puppeteer');
const fs = require('fs');
const cookies = require('./cookies.json');
// URL prefix used to recognise the search responses that are captured
// while the page is being scrolled.
const API_ENDPOINT = "https://www.linkedin.com/voyager/api/search/hits?";
/**
 * Tell whether `url` is a LinkedIn company page URL.
 *
 * The URL must *start* with the company prefix; a URL that only embeds the
 * prefix somewhere else (for instance inside a query string) is rejected.
 *
 * @param {*} url - Candidate URL; anything that is not a string yields false.
 * @returns {boolean} True when `url` begins with the LinkedIn company prefix.
 */
const isAgencyUrl = (url) =>
  typeof url === 'string' && url.startsWith("https://www.linkedin.com/company");
/**
 * Keep scrolling the given page until no `.artdeco-loader` element is found.
 *
 * Runs inside the browser context: on every tick it looks for
 * `.artdeco-loader` elements (presumably LinkedIn's lazy-load indicator —
 * confirm against the live markup), scrolls down by a fixed distance, and
 * stops once the lookup made before that scroll found none.
 *
 * @param {object} page - Puppeteer page to scroll.
 * @returns {Promise<void>} Resolves when scrolling has stopped.
 */
const scrollUntilLoaded = (page) => {
  const scrollDistancePx = 600;
  // One random delay between 3000 and 6000 ms, reused for every tick.
  const scrollDelayMs = Math.floor(Math.random() * (6000 - 3000 + 1)) + 3000;

  return page.evaluate(
    (distance, delayMs) =>
      new Promise((resolve) => {
        const timer = setInterval(() => {
          // Sample the loaders before scrolling, as the scroll may spawn new ones.
          const loaders = document.querySelectorAll('.artdeco-loader');
          window.scrollBy(0, distance);
          if (loaders.length === 0) {
            clearInterval(timer);
            resolve();
          }
        }, delayMs);
      }),
    scrollDistancePx,
    scrollDelayMs,
  );
};

/**
 * Scrape the "People" tab of a LinkedIn company page and write the captured
 * search results to `out.json`.
 *
 * Instead of scraping each profile, the function listens to the browser's
 * responses and stores the `included` field of every response whose URL
 * contains `API_ENDPOINT`.
 *
 * Failures after the browser has launched are logged to stderr and are not
 * rethrown; the browser is always closed exactly once. If only the scrolling
 * step fails, whatever was captured so far is still written to disk.
 *
 * @param {string} url - URL of the company page to open.
 * @returns {Promise<void>} Resolves once the browser has been closed.
 */
const ScrapePage = async (url) => {
  const browser = await puppeteer.launch({ headless: false });
  const result = [];
  // Body reads still in flight; awaited before writing so nothing is lost.
  const pendingBodies = [];

  try {
    const page = await browser.newPage();
    await page.setCookie(...cookies);

    // We intercept browser responses instead of scraping each profile.
    page.on('response', (response) => {
      if (!response.url().includes(API_ENDPOINT)) {
        return;
      }
      const pending = response
        .text()
        .then((textBody) => {
          const { included } = JSON.parse(textBody);
          // Skip responses without an `included` payload.
          if (included != null) {
            result.push(included);
          }
        })
        .catch((err) => {
          // An unreadable or non-JSON body must not abort the whole run.
          console.error(`Could not read response from ${response.url()}:`, err);
        });
      pendingBodies.push(pending);
    });

    await page.goto(url);

    const peopleUrl = await page.evaluate(() => {
      const tab = document.querySelector('[data-control-name="page_member_main_nav_people_tab"]');
      return tab ? tab.href : null;
    });
    if (!peopleUrl) {
      throw new Error(`People tab link not found on ${url}`);
    }
    await page.goto(peopleUrl);

    try {
      await scrollUntilLoaded(page);
    } catch (err) {
      // Best effort: keep the partial capture rather than discarding it.
      console.error('Scrolling stopped early; saving what was captured so far:', err);
    }

    await Promise.all(pendingBodies);
    fs.writeFileSync('out.json', JSON.stringify(result));
  } catch (err) {
    console.error(`Failed to scrape ${url}:`, err);
  } finally {
    await browser.close();
  }
};
/**
 * Entry point: validate the URL and start scraping it.
 *
 * A URL rejected by `isAgencyUrl` is reported on stderr and nothing is
 * scraped. A rejection from `ScrapePage` is logged and flagged through the
 * process exit code instead of being left as an unhandled rejection.
 *
 * @param {string} url - Company page URL to scrape.
 * @returns {void}
 */
const init = (url) => {
  if (!isAgencyUrl(url)) {
    console.error(`Not a LinkedIn company URL, nothing to scrape: ${url}`);
    return;
  }

  ScrapePage(url).catch((err) => {
    console.error('Scrape failed:', err);
    process.exitCode = 1;
  });
};
// Kick off the scrape; replace the placeholder with a real company page URL.
init("https://www.linkedin.com/company/some-company-name");
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment