Created
January 13, 2021 20:54
-
-
Save Hyllesen/fa3716fa0e76c8bd38428b014f16c53a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const puppeteer = require("puppeteer"); | |
const cheerio = require("cheerio"); | |
/**
 * Opens one salary-listing page and collects the job links found in it.
 *
 * If the browser ends up on a different address than the one requested, both
 * addresses are logged together with an end-of-results notice; the page that
 * was actually loaded is still scraped.
 *
 * @param {object} page - Puppeteer page used to navigate and read the HTML.
 * @param {string} url - Address of the listing page to open.
 * @returns {Promise<Array<{company: string, title: string, source: string, url: string}>>}
 *   One entry per anchor matched by ".data-table__value a" that has an href.
 */
async function scrapeListings(page, url) {
  await page.goto(url);

  // page.url() returns a plain string, so there is nothing to await.
  const currentUrl = page.url();
  if (currentUrl !== url) {
    console.log(currentUrl);
    console.log(url);
    console.log("We've reached the end!");
  }

  const html = await page.content();
  const $ = cheerio.load(html);
  const baseUrl = "https://www.payscale.com";

  const listings = $(".data-table__value a")
    .map((index, element) => {
      const link = $(element);
      const href = link.attr("href");
      if (!href) {
        // Returning null makes cheerio's map() drop the element, so an anchor
        // without a target never produces a bogus "...comundefined" URL.
        return null;
      }
      return {
        company: "hsbc",
        title: link.text().trim(),
        source: "payscale",
        // Resolving against the base handles both relative and absolute hrefs.
        url: new URL(href, baseUrl).href,
      };
    })
    .get();

  return listings;
}
/**
 * Dismisses the privacy/cookie modal by clicking its button.
 *
 * The button is awaited before it is clicked, because clicking a selector that
 * has not been rendered yet makes Puppeteer throw. If the button never shows up
 * within Puppeteer's wait timeout, the function logs that and returns without
 * clicking.
 *
 * @param {object} page - Puppeteer page currently showing the site.
 * @returns {Promise<void>} Resolves once the button is hidden again, or once
 *   the wait for the button has timed out.
 * @throws Any error other than a wait timeout on the initial lookup.
 */
async function acceptCookies(page) {
  const selector = ".privacy-modal__buttons > button";

  try {
    await page.waitForSelector(selector, { visible: true });
  } catch (err) {
    // Only a timeout means "no banner"; anything else is a real failure.
    if (!err || err.name !== "TimeoutError") {
      throw err;
    }
    console.log("Cookie banner did not appear; continuing without accepting.");
    return;
  }

  await page.click(selector);
  // Wait until the modal is gone so later steps do not race with it.
  await page.waitForSelector(selector, { hidden: true });
}
/**
 * Launches a visible browser, accepts the cookie modal on the employer salary
 * page, then walks the numbered listing pages and logs the listings of each.
 *
 * Paging stops as soon as a page yields no listings or the browser lands on an
 * address that was already scraped (for example after being redirected),
 * instead of always requesting every page number up to the upper bound.
 * The browser is closed when the run finishes or fails.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const browser = await puppeteer.launch({ headless: false });
  try {
    const [page] = await browser.pages();
    await page.goto("https://www.payscale.com/research/UK/Employer=HSBC/Salary/");
    await acceptCookies(page);

    // Final addresses of the pages scraped so far, used to spot the point
    // where further page numbers stop leading anywhere new.
    const visitedUrls = new Set();

    for (let pageNumber = 1; pageNumber < 999; pageNumber++) {
      const url =
        "https://www.payscale.com/research/UK/Employer=HSBC/Salary/Page-" +
        pageNumber;
      const listings = await scrapeListings(page, url);

      // Where the browser actually ended up, which may differ from `url`.
      const landedUrl = page.url();
      if (visitedUrls.has(landedUrl) || listings.length === 0) {
        break;
      }
      visitedUrls.add(landedUrl);

      console.log(listings);
    }
  } finally {
    // Always release the browser, even when scraping throws.
    await browser.close();
  }
}
// Run the scraper; report a failure and exit non-zero instead of leaving an
// unhandled promise rejection.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment