Last active
February 12, 2019 16:38
-
-
Save thayton/0c703f7a18ae1fe2d90db04910ff1434 to your computer and use it in GitHub Desktop.
Puppeteer scraper for https://l3com.taleo.net/careersection/l3_ext_us/jobsearch.ftl
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
node_modules/ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* Companion code for article at http://toddhayton.com/... | |
* | |
* Setup: | |
* $ mkdir scraper/ | |
* $ cd scraper/ | |
* $ npm init -y | |
* $ npm install puppeteer --save | |
* | |
* Usage: | |
* $ node l3com_scraper.js | |
*/ | |
const puppeteer = require('puppeteer'); | |
const url = 'https://l3com.taleo.net/careersection/l3_ext_us/jobsearch.ftl'; | |
/* | |
* The text inside the reload message span changes once the next page of jobs
* has been loaded. For the first page of jobs, the span starts out empty.
*/ | |
/**
 * Wait until the jobs list on `page` has finished (re)loading.
 *
 * Loading is detected by watching the text of `span#reloadMessage`: the call
 * resolves once that text differs from the text recorded by the previous call
 * ('' before the first call). The new text is then remembered for the next
 * call, so this helper is stateful across invocations.
 *
 * @param {object} page - Puppeteer page showing the job search results.
 * @returns {Promise<void>} Resolves once the reload message text has changed.
 */
const waitForJobsToLoad = (() => {
    // Text seen in span#reloadMessage after the previous load.
    let lastReloadMessage = '';

    return async function (page) {
        await page.waitForFunction(
            (oldText) => {
                const span = document.querySelector('span#reloadMessage');
                // Report "not ready" while the span is absent so the predicate
                // never dereferences null.
                return span !== null && span.innerText !== oldText;
            },
            {},
            lastReloadMessage
        );

        lastReloadMessage = await page.$eval('span#reloadMessage', (el) => el.innerText);
    };
})();
/* Alternatively we could wait for the progress indicator to appear/disappear | |
* | |
async function waitForJobsToLoad(page) { | |
await page.waitFor(() => document.querySelector('div#progressIndicator').offsetWidth !== 0); | |
await page.waitFor(() => document.querySelector('div#progressIndicator').offsetWidth === 0); | |
} | |
*/ | |
/*------------------------------------------------------------------------------ | |
* Look for the link for pageno in the pager. So if pageno was 6 we'd look for
* the pager link whose text is '6':
* | |
* <a href="#" title="Go to page 6" aria-disabled="false">6</a> | |
*/ | |
/**
 * Click the pager link for page `pageno`, if there is one, and wait for the
 * jobs on that page to load.
 *
 * @param {object} page - Puppeteer page showing the job search results.
 * @param {number} pageno - Page number to navigate to; matched against the
 *     text of the pager link.
 * @returns {Promise<boolean>} `true` when no link for `pageno` exists (there
 *     are no more pages), `false` after navigating to the requested page.
 */
async function gotoNextPage(page, pageno) {
    const nextPageXp = `//ul[@class='pager']/li[@class='pagerlink']/a[text()='${pageno}']`;
    const [nextPageLink] = await page.$x(nextPageXp);

    // No matching pager link: the current page is the last one.
    if (nextPageLink === undefined) {
        return true;
    }

    await nextPageLink.click();
    await waitForJobsToLoad(page);
    return false;
}
/**
 * Extract the jobs listed on the currently displayed results page.
 *
 * The extraction runs inside the page: every row matching
 * `table#jobs tr[id^="job"]` is turned into a plain object built from the
 * row's `th` cell (title and link) and its second and third `td` cells.
 *
 * @param {object} page - Puppeteer page showing the job search results.
 * @returns {Promise<Array<{title: string, href: string, location: string, postingDate: string}>>}
 *     One entry per job row, in document order.
 */
async function getJobs(page) {
    return page.evaluate((jobSelector) => {
        const rows = Array.from(document.querySelectorAll(jobSelector));

        return rows.map((tr) => {
            // Declared locally so nothing leaks onto the page's global object.
            const th = tr.querySelector('th');
            const td = tr.querySelectorAll('td');

            return {
                title: th.innerText.trim(),
                href: th.querySelector('a').href,
                location: td[1].innerText.trim(),
                postingDate: td[2].innerText.trim(),
            };
        });
    }, 'table#jobs tr[id^="job"]');
}
/**
 * Scrape every page of job results and print each job as pretty-printed JSON.
 *
 * Launches a browser, opens the job search URL, then repeatedly prints the
 * jobs on the current page and advances to the next page until no further
 * pager link is found. The browser is closed even when a step fails.
 *
 * @returns {Promise<void>} Resolves once all pages are printed and the browser
 *     is closed; rejects with the original error if any step fails.
 */
async function main() {
    // For debugging, launch with: { slowMo: 250, headless: false, devtools: true }
    const browser = await puppeteer.launch();

    try {
        const page = await browser.newPage();

        await page.goto(url);
        await waitForJobsToLoad(page);

        for (let pageno = 1; ; pageno++) {
            console.log(`Getting jobs on page ${pageno}`);

            const jobs = await getJobs(page);
            jobs.forEach((job) => console.log(JSON.stringify(job, null, 2)));

            const noMorePages = await gotoNextPage(page, pageno + 1);
            if (noMorePages) {
                break;
            }

            /* Don't hit the server too quickly... */
            await page.waitFor(1000);
        }
    } finally {
        // Always release the browser process, including on failure.
        await browser.close();
    }
}
// Run the scraper. On failure, log the error and exit non-zero instead of
// leaving an unhandled promise rejection.
main()
    .then(() => console.log('Complete!'))
    .catch((err) => {
        console.error(err);
        process.exitCode = 1;
    });
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "l3com_scraper", | |
"version": "1.0.0", | |
"lockfileVersion": 1, | |
"requires": true, | |
"dependencies": { | |
"agent-base": { | |
"version": "4.2.1", | |
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-4.2.1.tgz", | |
"integrity": "sha512-JVwXMr9nHYTUXsBFKUqhJwvlcYU/blreOEUkhNR2eXZIvwd+c+o5V4MgDPKWnMS/56awN3TRzIP+KoPn+roQtg==", | |
"requires": { | |
"es6-promisify": "5.0.0" | |
} | |
}, | |
"async-limiter": { | |
"version": "1.0.0", | |
"resolved": "https://registry.npmjs.org/async-limiter/-/async-limiter-1.0.0.tgz", | |
"integrity": "sha512-jp/uFnooOiO+L211eZOoSyzpOITMXx1rBITauYykG3BRYPu8h0UcxsPNB04RR5vo4Tyz3+ay17tR6JVf9qzYWg==" | |
}, | |
"balanced-match": { | |
"version": "1.0.0", | |
"resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz", | |
"integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c=" | |
}, | |
"brace-expansion": { | |
"version": "1.1.11", | |
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", | |
"integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", | |
"requires": { | |
"balanced-match": "1.0.0", | |
"concat-map": "0.0.1" | |
} | |
}, | |
"buffer-from": { | |
"version": "1.1.1", | |
"resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.1.tgz", | |
"integrity": "sha512-MQcXEUbCKtEo7bhqEs6560Hyd4XaovZlO/k9V3hjVUF/zwW7KBVdSK4gIt/bzwS9MbR5qob+F5jusZsb0YQK2A==" | |
}, | |
"concat-map": { | |
"version": "0.0.1", | |
"resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", | |
"integrity": "sha1-2Klr13/Wjfd5OnMDajug1UBdR3s=" | |
}, | |
"concat-stream": { | |
"version": "1.6.2", | |
"resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz", | |
"integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==", | |
"requires": { | |
"buffer-from": "1.1.1", | |
"inherits": "2.0.3", | |
"readable-stream": "2.3.6", | |
"typedarray": "0.0.6" | |
} | |
}, | |
"core-util-is": { | |
"version": "1.0.2", | |
"resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", | |
"integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=" | |
}, | |
"debug": { | |
"version": "3.1.0", | |
"resolved": "https://registry.npmjs.org/debug/-/debug-3.1.0.tgz", | |
"integrity": "sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==", | |
"requires": { | |
"ms": "2.0.0" | |
} | |
}, | |
"es6-promise": { | |
"version": "4.2.4", | |
"resolved": "https://registry.npmjs.org/es6-promise/-/es6-promise-4.2.4.tgz", | |
"integrity": "sha512-/NdNZVJg+uZgtm9eS3O6lrOLYmQag2DjdEXuPaHlZ6RuVqgqaVZfgYCepEIKsLqwdQArOPtC3XzRLqGGfT8KQQ==" | |
}, | |
"es6-promisify": { | |
"version": "5.0.0", | |
"resolved": "https://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz", | |
"integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=", | |
"requires": { | |
"es6-promise": "4.2.4" | |
} | |
}, | |
"extract-zip": { | |
"version": "1.6.7", | |
"resolved": "https://registry.npmjs.org/extract-zip/-/extract-zip-1.6.7.tgz", | |
"integrity": "sha1-qEC0uK9kAyZMjbV/Txp0Mz74H+k=", | |
"requires": { | |
"concat-stream": "1.6.2", | |
"debug": "2.6.9", | |
"mkdirp": "0.5.1", | |
"yauzl": "2.4.1" | |
}, | |
"dependencies": { | |
"debug": { | |
"version": "2.6.9", | |
"resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", | |
"integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", | |
"requires": { | |
"ms": "2.0.0" | |
} | |
} | |
} | |
}, | |
"fd-slicer": { | |
"version": "1.0.1", | |
"resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.0.1.tgz", | |
"integrity": "sha1-i1vL2ewyfFBBv5qwI/1nUPEXfmU=", | |
"requires": { | |
"pend": "1.2.0" | |
} | |
}, | |
"fs.realpath": { | |
"version": "1.0.0", | |
"resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", | |
"integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8=" | |
}, | |
"glob": { | |
"version": "7.1.3", | |
"resolved": "https://registry.npmjs.org/glob/-/glob-7.1.3.tgz", | |
"integrity": "sha512-vcfuiIxogLV4DlGBHIUOwI0IbrJ8HWPc4MU7HzviGeNho/UJDfi6B5p3sHeWIQ0KGIU0Jpxi5ZHxemQfLkkAwQ==", | |
"requires": { | |
"fs.realpath": "1.0.0", | |
"inflight": "1.0.6", | |
"inherits": "2.0.3", | |
"minimatch": "3.0.4", | |
"once": "1.4.0", | |
"path-is-absolute": "1.0.1" | |
} | |
}, | |
"https-proxy-agent": { | |
"version": "2.2.1", | |
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.1.tgz", | |
"integrity": "sha512-HPCTS1LW51bcyMYbxUIOO4HEOlQ1/1qRaFWcyxvwaqUS9TY88aoEuHUY33kuAh1YhVVaDQhLZsnPd+XNARWZlQ==", | |
"requires": { | |
"agent-base": "4.2.1", | |
"debug": "3.1.0" | |
} | |
}, | |
"inflight": { | |
"version": "1.0.6", | |
"resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", | |
"integrity": "sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk=", | |
"requires": { | |
"once": "1.4.0", | |
"wrappy": "1.0.2" | |
} | |
}, | |
"inherits": { | |
"version": "2.0.3", | |
"resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz", | |
"integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4=" | |
}, | |
"isarray": { | |
"version": "1.0.0", | |
"resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", | |
"integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=" | |
}, | |
"mime": { | |
"version": "2.3.1", | |
"resolved": "https://registry.npmjs.org/mime/-/mime-2.3.1.tgz", | |
"integrity": "sha512-OEUllcVoydBHGN1z84yfQDimn58pZNNNXgZlHXSboxMlFvgI6MXSWpWKpFRra7H1HxpVhHTkrghfRW49k6yjeg==" | |
}, | |
"minimatch": { | |
"version": "3.0.4", | |
"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz", | |
"integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==", | |
"requires": { | |
"brace-expansion": "1.1.11" | |
} | |
}, | |
"minimist": { | |
"version": "0.0.8", | |
"resolved": "http://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz", | |
"integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0=" | |
}, | |
"mkdirp": { | |
"version": "0.5.1", | |
"resolved": "http://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz", | |
"integrity": "sha1-MAV0OOrGz3+MR2fzhkjWaX11yQM=", | |
"requires": { | |
"minimist": "0.0.8" | |
} | |
}, | |
"ms": { | |
"version": "2.0.0", | |
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", | |
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=" | |
}, | |
"once": { | |
"version": "1.4.0", | |
"resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", | |
"integrity": "sha1-WDsap3WWHUsROsF9nFC6753Xa9E=", | |
"requires": { | |
"wrappy": "1.0.2" | |
} | |
}, | |
"path-is-absolute": { | |
"version": "1.0.1", | |
"resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", | |
"integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18=" | |
}, | |
"pend": { | |
"version": "1.2.0", | |
"resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz", | |
"integrity": "sha1-elfrVQpng/kRUzH89GY9XI4AelA=" | |
}, | |
"process-nextick-args": { | |
"version": "2.0.0", | |
"resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz", | |
"integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw==" | |
}, | |
"progress": { | |
"version": "2.0.0", | |
"resolved": "https://registry.npmjs.org/progress/-/progress-2.0.0.tgz", | |
"integrity": "sha1-ihvjZr+Pwj2yvSPxDG/pILQ4nR8=" | |
}, | |
"proxy-from-env": { | |
"version": "1.0.0", | |
"resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.0.0.tgz", | |
"integrity": "sha1-M8UDmPcOp+uW0h97gXYwpVeRx+4=" | |
}, | |
"puppeteer": { | |
"version": "1.7.0", | |
"resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.7.0.tgz", | |
"integrity": "sha512-f+1DxKHPqce6CXUBz2eVO2WcATeVeQSOPG9GYaGObEZDCiCEUwG+gogjMsrvn7he2wHTqNVb5p6RUrwmr8XFBA==", | |
"requires": { | |
"debug": "3.1.0", | |
"extract-zip": "1.6.7", | |
"https-proxy-agent": "2.2.1", | |
"mime": "2.3.1", | |
"progress": "2.0.0", | |
"proxy-from-env": "1.0.0", | |
"rimraf": "2.6.2", | |
"ws": "5.2.2" | |
} | |
}, | |
"readable-stream": { | |
"version": "2.3.6", | |
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz", | |
"integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==", | |
"requires": { | |
"core-util-is": "1.0.2", | |
"inherits": "2.0.3", | |
"isarray": "1.0.0", | |
"process-nextick-args": "2.0.0", | |
"safe-buffer": "5.1.2", | |
"string_decoder": "1.1.1", | |
"util-deprecate": "1.0.2" | |
} | |
}, | |
"rimraf": { | |
"version": "2.6.2", | |
"resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.2.tgz", | |
"integrity": "sha512-lreewLK/BlghmxtfH36YYVg1i8IAce4TI7oao75I1g245+6BctqTVQiBP3YUJ9C6DQOXJmkYR9X9fCLtCOJc5w==", | |
"requires": { | |
"glob": "7.1.3" | |
} | |
}, | |
"safe-buffer": { | |
"version": "5.1.2", | |
"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", | |
"integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==" | |
}, | |
"string_decoder": { | |
"version": "1.1.1", | |
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", | |
"integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", | |
"requires": { | |
"safe-buffer": "5.1.2" | |
} | |
}, | |
"typedarray": { | |
"version": "0.0.6", | |
"resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz", | |
"integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c=" | |
}, | |
"util-deprecate": { | |
"version": "1.0.2", | |
"resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", | |
"integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=" | |
}, | |
"wrappy": { | |
"version": "1.0.2", | |
"resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", | |
"integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=" | |
}, | |
"ws": { | |
"version": "5.2.2", | |
"resolved": "https://registry.npmjs.org/ws/-/ws-5.2.2.tgz", | |
"integrity": "sha512-jaHFD6PFv6UgoIVda6qZllptQsMlDEJkTQcybzzXDYM1XO9Y8em691FGMPmM46WGyLU4z9KMgQN+qrux/nhlHA==", | |
"requires": { | |
"async-limiter": "1.0.0" | |
} | |
}, | |
"yauzl": { | |
"version": "2.4.1", | |
"resolved": "https://registry.npmjs.org/yauzl/-/yauzl-2.4.1.tgz", | |
"integrity": "sha1-lSj0QtqxsihOWLQ3m7GU4i4MQAU=", | |
"requires": { | |
"fd-slicer": "1.0.1" | |
} | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "l3com_scraper", | |
"version": "1.0.0", | |
"description": "", | |
"main": "l3com_scraper.js", | |
"scripts": { | |
"test": "echo \"Error: no test specified\" && exit 1" | |
}, | |
"repository": { | |
"type": "git", | |
"url": "git+ssh://git@gist.github.com/0c703f7a18ae1fe2d90db04910ff1434.git"
}, | |
"keywords": [], | |
"author": "", | |
"license": "ISC", | |
"bugs": { | |
"url": "https://gist.github.com/0c703f7a18ae1fe2d90db04910ff1434" | |
}, | |
"homepage": "https://gist.github.com/0c703f7a18ae1fe2d90db04910ff1434", | |
"dependencies": { | |
"puppeteer": "^1.7.0" | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment