Last active
March 4, 2020 12:14
-
-
Save VitalyKondratiev/5d60c59070c04c16355a7060365e716c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
dl_require(['puppeteer', 'fs', 'fast-csv']).then(_modules => { | |
// Modules handed over by dl_require, keyed by their package name.
const { puppeteer, fs, 'fast-csv': fast_csv } = _modules;

// Crawl entry point; replaced by the saved value when state.json is loaded.
let main_url = 'https://example.com/';
// URLs still waiting to be visited, seeded with the entry point.
let need_parse = [main_url];
// Every URL that has been taken off the queue.
let parsed = [];
// URLs that are written to pages.csv.
let parsed_pages = [];
// Puppeteer browser handle; stays null until the crawler launches it.
let browser = null;
// Write every URL held in parsed_pages to pages.csv: one single-column row
// per URL, with no header row.
let save_csv = function () {
    const rows = parsed_pages.map((page_url) => [page_url]);
    const output = fs.createWriteStream('pages.csv');
    fast_csv.write(rows, { headers: false }).pipe(output);
};
/**
 * Crawl every URL queued in `need_parse`, one page at a time, until the
 * queue is empty.
 *
 * Each URL is taken off the head of the queue, stripped of a trailing slash,
 * recorded in `parsed` and opened in a fresh tab. When the response is a 200
 * `text/html` document the URL is also recorded in `parsed_pages` and the
 * page's same-site links (protocol + host + path only, trailing slash
 * stripped) are appended to the queue unless already queued or visited.
 * After every page the crawl state is written to state.json and `save_csv`
 * is called. The browser is launched on first use and closed once the queue
 * runs dry.
 *
 * @returns {Promise<void>} Resolves when the queue is exhausted, or right
 *     away when it is empty on entry. Rejects if a tab cannot be opened or
 *     closed.
 */
let parse = async function () {
    if (need_parse.length === 0) {
        return;
    }
    if (browser == null) {
        browser = await puppeteer.launch({
            args: ['--no-sandbox'],
            headless: true,
        });
    }

    const strip_trailing_slash = (url) => url.replace(/\/$/, '');
    // Same-site means the start host with an optional "www." prefix. The host
    // is escaped so that its dots only match literal dots.
    const escaped_host = new URL(main_url).host.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const same_site = new RegExp(`^(?:www\\.)?${escaped_host}$`);

    // Iterating (instead of re-invoking parse without awaiting it) keeps the
    // whole crawl inside the single promise returned to the caller.
    while (need_parse.length > 0) {
        const page = await browser.newPage();
        // Dequeue the head itself. Looking the stripped URL up again would
        // miss entries stored with a trailing slash and drop the wrong one.
        const pageURL = strip_trailing_slash(need_parse.shift());
        parsed.push(pageURL);
        let log_message = null;
        try {
            const response = await page.goto(pageURL, { waitUntil: 'networkidle0' });
            // goto may yield no response, and the header may be absent.
            const status = response ? response.status() : null;
            const content_type = (response && response.headers()['content-type']) || '';
            if (status === 200 && content_type.startsWith('text/html')) {
                parsed_pages.push(pageURL);
                log_message = `${pageURL} (OK: HTML page`;
                const hrefs = await page.evaluate(() => {
                    const links = Array.from(document.querySelectorAll('a[href]'));
                    return links.map(link => link.href);
                });
                for (const href of new Set(hrefs)) {
                    let link;
                    try {
                        link = new URL(href);
                    }
                    catch (err) {
                        // An unparsable href cannot be crawled; skip it
                        // without losing the other links of this page.
                        continue;
                    }
                    if (!same_site.test(link.host)) {
                        continue;
                    }
                    const url = strip_trailing_slash(`${link.protocol}//${link.host}${link.pathname}`);
                    if (!need_parse.includes(url) && !parsed.includes(url)) {
                        need_parse.push(url);
                    }
                }
            }
            else {
                log_message = `${pageURL} (ERROR: ${content_type || 'no content-type'}, ${status}`;
            }
        }
        catch (err) {
            // Report the real failure (timeout, DNS, navigation error, ...).
            log_message = `${pageURL} (ERROR: ${err.message}`;
        }
        finally {
            try {
                await page.goto('about:blank');
            }
            catch (err) {
                // Best effort only: the tab is closed right below either way.
            }
            await page.close();
        }

        const state = {
            'main_url': main_url,
            'parsed_pages': parsed_pages,
            'parsed': parsed,
            'need_parse': need_parse,
        };
        fs.writeFile('state.json', JSON.stringify(state, null, 4), (err) => {
            if (err) {
                console.error(`Could not save state.json: ${err.message}`);
            }
        });
        console.log(`${log_message}, left: ${need_parse.length})`);
        save_csv();
    }

    console.log(`Parsed with ${parsed_pages.length} pages`);
    await browser.close();
};
// Resume from a previous run's state.json when one can be read, otherwise
// start a fresh crawl. The file is read directly rather than probed with the
// deprecated fs.exists, which also removes the check-then-read race.
fs.readFile('state.json', (err, data) => {
    if (err) {
        // A missing file simply means there is nothing to resume.
        if (err.code !== 'ENOENT') {
            console.error(`Could not read state.json (${err.message}), starting from scratch`);
        }
    }
    else {
        try {
            const state = JSON.parse(data);
            main_url = state.main_url;
            parsed_pages = state.parsed_pages;
            parsed = state.parsed;
            need_parse = state.need_parse;
        }
        catch (parse_error) {
            console.error(`Could not parse state.json (${parse_error.message}), starting from scratch`);
        }
    }
    // Surface crawl failures instead of leaving an unhandled rejection.
    parse().catch((crawl_error) => {
        console.error(crawl_error);
        process.exitCode = 1;
    });
});
}); | |
/**
 * Load the given modules with `require`, installing the ones that fail to
 * load through `npm i` and then requiring them again.
 *
 * @param {string[]} module_names - Names passed to `require` (and to
 *     `npm i` for those that could not be loaded).
 * @returns {Promise<Object>} Resolves with an object mapping each module
 *     name to its loaded module. Rejects with an Error when the install
 *     command fails or a module still cannot be loaded afterwards.
 */
function dl_require(module_names) {
    return new Promise(function (resolve, reject) {
        const modules = {};
        const missing_modules = [];
        module_names.forEach(module_name => {
            try {
                modules[module_name] = require(module_name);
            }
            catch (ex) {
                // Any load failure is treated as "not installed yet".
                missing_modules.push(module_name);
            }
        });
        if (missing_modules.length === 0) {
            resolve(modules);
            return;
        }

        const command = `npm i ${missing_modules.join(' ')}`;
        const child = require('child_process').exec(command, function (err, stdout, stderr) {
            if (err) {
                // Reject rather than throw: a throw inside this callback
                // would be an uncaught exception and leave the promise pending.
                const failure = new Error(`"${command}" failed: ${stderr || err.message}`);
                failure.cause = err;
                reject(failure);
                return;
            }
            try {
                missing_modules.forEach(module_name => {
                    modules[module_name] = require(module_name);
                });
            }
            catch (ex) {
                reject(ex);
                return;
            }
            resolve(modules);
        });
        // Echo npm's progress output while the install is running.
        child.stdout.on('data', function (data) {
            console.log(data.toString());
        });
    });
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment