Scrape a site and capture all of the content (and images)
const puppeteer = require('puppeteer'); // v 1.1.0
const { URL } = require('url');
const fse = require('fs-extra'); // v 5.0.0
const path = require('path');

const NO_RESPONSE_ERROR_MESSAGE = 'No data found for resource with given identifier';
const NAME_TOO_LONG_ERROR_MESSAGE = 'ENAMETOOLONG';
const THIRTY_SECONDS_MS = 1000 * 30;

const visited = new Set();
const queued = new Set();
const queue = [];
const whitelist = new Set();

let currentUrl = null;
let currentTask = Promise.resolve();
async function start(urlToFetch) {
  addToWhitelist(urlToFetch);

  await fse.remove('./output');

  /* 1 */
  const browser = await puppeteer.launch();
  const page = await browser.newPage();

  /* 2 */
  // Persist every response (pages, images, scripts, etc.) to ./output
  page.on('response', async (response) => {
    logFile(response);
    const filePath = getFilePath(response);
    try {
      const data = await getData(response);
      await fse.outputFile(filePath, data);
    } catch (e) {
      if (e.message.includes(NAME_TOO_LONG_ERROR_MESSAGE) || e.message.includes(NO_RESPONSE_ERROR_MESSAGE)) {
        // Do nothing
      } else {
        console.warn(e.message);
      }
    }
  });

  // After each page load, record progress, queue its links, and move on
  page.on('load', async () => {
    // Don't cut off a page that's loading
    await currentTask;

    console.log(`Loaded page: ${page.url()}`);

    fse.outputFile('./data.json', JSON.stringify({
      visited: Array.from(visited),
      queue,
    }, null, 2));

    await enqueueLinks(page);
    goToNextPage(page);
  });

  /* 3 */
  await page.goto(urlToFetch, {
    waitUntil: 'networkidle2'
  });

  // /* 4 */
  // setTimeout(async () => browser.close(), 60000 * 4);
}
// Allow both the bare hostname and its www. variant
function addToWhitelist(url) {
  const { hostname } = new URL(url);
  whitelist.add(hostname);
  if (!hostname.match(/^www/)) {
    whitelist.add('www.' + hostname);
  }
}
function logFile(response) {
  // Note: _request/_resourceType are Puppeteer internals (used here against v1.1.0)
  const type = response._request._resourceType;
  const url = response.url();
  // console.log(`Loading ${type}: ${url}`);
}
// Map a response URL to a path under ./output, defaulting to index.html
// for extension-less paths
function getFilePath(response) {
  const url = new URL(response.url());
  let filePath = path.resolve(`./output${url.pathname}`);
  if (path.extname(url.pathname).trim() === '') {
    filePath = `${filePath}/index.html`;
  }
  return filePath;
}
// Queue a URL for crawling if it's on a whitelisted host and not already queued
function addToQueue(rawUrl) {
  if (!queued.has(rawUrl)) {
    const url = new URL(rawUrl);
    if (whitelist.has(url.hostname)) {
      queued.add(rawUrl);
      queue.push(rawUrl);
      return true;
    }
  }
  return false;
}
async function getData(response) {
  if (response._request._resourceType === 'document') {
    const data = await response.text();
    // Rewrite absolute URLs to relative paths so the saved copy works
    // offline, queuing same-host links as they're found
    return data
      .replace(/href="(https?:\/\/[^"]+)"/g, (_, capture) => {
        const url = new URL(capture);
        if (whitelist.has(url.hostname)) {
          addToQueue(capture);
          return `href="${url.pathname}"`;
        }
        return `href="${capture}"`;
      })
      .replace(/src="(https?:\/\/[^"]+)"/g, (_, capture) => {
        const url = new URL(capture);
        return `src="${url.pathname}"`;
      })
      .replace(/image="(https?:\/\/[^"]+)"/g, (_, capture) => {
        const url = new URL(capture);
        return `image="${url.pathname}"`;
      })
      .replace(/poster="(https?:\/\/[^"]+)"/g, (_, capture) => {
        const url = new URL(capture);
        return `poster="${url.pathname}"`;
      })
      .replace(/content="(https?:\/\/[^"]+)"/g, (_, capture) => {
        const url = new URL(capture);
        return `content="${url.pathname}"`;
      })
      .replace(/"image":"(https?:\/\/[^"]+)"/g, (_, capture) => {
        const url = new URL(capture);
        return `"image":"${url.pathname}"`;
      })
      .replace(/"url":"(https?:\\\/\\\/[^"]+)"/g, (_, capture) => {
        // Handle JSON-escaped URLs (https:\/\/...)
        const rawUrl = capture.replace(/\\\//g, '/');
        addToQueue(rawUrl);
        const url = new URL(rawUrl);
        return `"url":"${url.pathname.replace(/\//g, '\\\/')}"`;
      })
      .replace(/background-image:\s*url\(['"]([^'"]+)['"]\)/g, (_, capture) => {
        const url = new URL(capture);
        return `background-image: url('${url.pathname}')`;
      });
  }
  return response.buffer();
}
async function enqueueLinks(page) {
  // Skip non-HTML docs (images, etc.)
  const type = await page.evaluate(() => document.contentType);
  if (type !== 'text/html') return;

  // Collect the href of every element on the page
  const links = await page.evaluate(() => {
    return Array.from(document.querySelectorAll('*')).reduce((links, el) => {
      if (!el.href) return links;
      return links.concat(el.href);
    }, []);
  });

  let count = 0;
  for (const link of links) {
    if (addToQueue(link)) {
      count++;
    }
  }
  console.log(`Added ${count} urls`);
}
// Pull URLs off the queue until we find one that hasn't been visited
function getNextUnvisited() {
  let next;
  while ((!next || visited.has(next)) && queue.length) {
    next = queue.shift();
  }
  return visited.has(next) ? null : next;
}

function sleep(ms) {
  return new Promise(ok => setTimeout(ok, ms));
}
// Navigate to a URL, but give up after thirty seconds
async function goToWithTimeout(page, next) {
  try {
    const result = await Promise.race([
      page.goto(next, { waitUntil: 'networkidle0' }),
      sleep(THIRTY_SECONDS_MS)
    ]);
    const reason = result ? '' : ' - timed out';
    console.log(`Done with ${next}${reason}`);
  } catch (e) {
    console.warn(`Failed ${next}: ${e.message}`);
  }
}
async function goToNextPage(page) {
  const next = getNextUnvisited();
  if (next) {
    visited.add(next);
    currentUrl = next;
    currentTask = goToWithTimeout(page, next);
    await currentTask;

    await sleep(THIRTY_SECONDS_MS);
    // If nothing else navigated the page in the meantime, the load handler
    // never fired (or nothing else got queued up), so advance manually
    if (currentUrl === next) {
      goToNextPage(page);
    }
  } else {
    console.log('Done...?');
  }
}
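
The gist defines start() but never calls it, so running the file as-is does nothing. A minimal sketch of an entry point, assuming the script is run directly with node and using https://example.com as a stand-in for the site you want to mirror:

// Hypothetical entry point (not part of the original gist): kick off the crawl
// from the root URL. Saved pages land in ./output, crawl progress in ./data.json.
start('https://example.com')
  .catch((e) => {
    console.error(`Failed to start crawl: ${e.message}`);
    process.exit(1);
  });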
Apparently, Puppeteer won't fire a load event if you hit a page a second time. So, rather than try to figure out the right way to do it, I just added a timeout to goToNextPage(). Not great, but it's working well.
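
If relying on the load event proves too flaky, one alternative (not what this gist does) is to drive the loop directly from the navigation promise instead. A rough sketch reusing the helpers defined above:

// Sketch of an event-free crawl loop: keep pulling from the queue until it
// drains, doing the per-page work explicitly after each navigation settles.
// The 'response' handler registered in start() still saves files as pages load.
async function crawl(page) {
  let next;
  while ((next = getNextUnvisited())) {
    visited.add(next);
    currentUrl = next;
    await goToWithTimeout(page, next); // races navigation against the 30s timeout
    await enqueueLinks(page);          // queue any newly discovered same-host links
  }
  console.log('Queue drained');
}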