|
const puppeteer = require('puppeteer'); |
|
const fs = require('fs'); |
|
|
|
// Optional settings taken from the environment:
//   PROXY      - forwarded to the browser as `--proxy-server=<value>` when set
//   USER_AGENT - applied to every page when set
const { PROXY: proxy, USER_AGENT } = process.env;

// Number of URLs handed to one browser context at a time.
const BATCH_SIZE = 64;

// Navigation timeout given to `page.goto` (5 seconds, expressed in ms).
const TIMEOUT = 5000;

// URL suffixes of requests that are aborted instead of being fetched.
const BLOCK = ['.jpg', '.png', '.gif', '.css', '.js'];
|
|
|
/**
 * Launch a Puppeteer browser, adding a `--proxy-server` argument when the
 * module-level `proxy` setting is non-empty.
 *
 * @returns {Promise<object>} whatever `puppeteer.launch()` resolves with.
 */
async function getBrowser() {
  const args = [];
  if (proxy) {
    args.push(`--proxy-server=${proxy}`);
  }
  return puppeteer.launch({ args });
}
|
|
|
/**
 * Write one diagnostic line to stderr, keeping stdout free for results.
 *
 * @param {*} msg - value to print; a newline is appended.
 */
function log(msg) {
  const line = msg + '\n';
  process.stderr.write(line);
}
|
|
|
/**
 * Decide whether a request URL points at a resource type listed in BLOCK.
 *
 * The suffix is matched against the URL's path only, lower-cased, so query
 * strings, fragments and upper-case extensions (`style.css?v=2`, `A.PNG#x`)
 * are recognised as well.
 *
 * @param {string} rawUrl - URL reported by the intercepted request.
 * @returns {boolean} true when the request should be aborted.
 */
function isBlockedResource(rawUrl) {
  let path;
  try {
    path = new URL(rawUrl).pathname;
  } catch (err) {
    // Not parseable as a URL: fall back to testing the raw string.
    path = rawUrl;
  }
  const lowered = path.toLowerCase();
  return BLOCK.some((suffix) => lowered.endsWith(suffix));
}

/**
 * Open a new page in the given browser context and configure it: optional
 * custom user agent, JavaScript disabled, and request interception that
 * aborts requests for blocked resource types.
 *
 * @param {object} ctx - browser context providing `newPage()`.
 * @returns {Promise<object>} the configured page.
 */
async function preparePage(ctx) {
  const page = await ctx.newPage();
  if (USER_AGENT) {
    await page.setUserAgent(USER_AGENT);
  }
  await page.setJavaScriptEnabled(false);
  await page.setRequestInterception(true);
  page.on('request', (req) => {
    if (isBlockedResource(req.url())) {
      req.abort();
    } else {
      req.continue();
    }
  });
  return page;
}
|
|
|
/**
 * Load `url` in a fresh page of `ctx` and report the document's character set.
 *
 * Failures while preparing the page, navigating or evaluating are logged and
 * reported as charset 'unknown' instead of rejecting. The page, if one was
 * opened, is always closed; an error while closing is logged as well so that
 * one broken page cannot abort the whole run.
 *
 * @param {object} ctx - browser context passed on to `preparePage`.
 * @param {string} url - address to load.
 * @returns {Promise<{url: string, charset: string}>}
 */
async function fetchUrl(ctx, url) {
  let charset = 'unknown';
  let page = null;
  try {
    page = await preparePage(ctx);
    await page.goto(url, {
      timeout: TIMEOUT,
      waitUntil: 'domcontentloaded',
    });
    charset = await page.evaluate('document.characterSet');
  } catch (err) {
    log(`${url} : ${err}`);
  } finally {
    if (page) {
      try {
        await page.close();
      } catch (err) {
        // Closing is best-effort: the result gathered above is still valid.
        log(`${url} : ${err}`);
      }
    }
  }
  return { url, charset };
}
|
|
|
/**
 * Start a fetch for every URL of the batch and hand out the resulting
 * promises one by one, in input order.
 *
 * All fetches are started together on the first `next()` call; iterating
 * only controls the order in which the caller awaits them.
 *
 * @param {object} ctx - browser context shared by the whole batch.
 * @param {string[]} urls - addresses to fetch.
 * @yields {Promise<{url: string, charset: string}>}
 */
function* getBatch(ctx, urls) {
  const pending = urls.map((url) => fetchUrl(ctx, url));
  for (const promise of pending) {
    yield promise;
  }
}
|
|
|
/**
 * Split an iterable into consecutive batches of at most `size` elements.
 *
 * Every yielded batch is its own array, so callers may keep or collect the
 * batches (e.g. `[...makeBatches(items, n)]`) without later batches
 * overwriting earlier ones.
 *
 * @param {Iterable<*>} list - elements to group, in order.
 * @param {number} size - batch length; if it is never reached (for example
 *   zero or negative) all elements end up in a single final batch.
 * @yields {Array<*>} the next batch; the last one may be shorter.
 */
function* makeBatches(list, size) {
  let batch = [];
  for (const elem of list) {
    batch.push(elem);
    if (batch.length === size) {
      yield batch;
      // Start a fresh array instead of truncating the one just handed out.
      batch = [];
    }
  }
  if (batch.length > 0) {
    yield batch;
  }
}
|
|
|
/**
 * Read a text file and return its non-empty lines.
 *
 * Lines may end in `\n` or `\r\n`; empty lines are dropped.
 *
 * @param {number|string} [file=0] - file descriptor or path handed to
 *   `fs.readFile`; the default 0 is standard input.
 * @returns {Promise<string[]>} the non-empty lines, in file order. Rejects
 *   with the `fs.readFile` error when the file cannot be read.
 */
function readList(file = 0) {
  return new Promise((resolve, reject) => {
    fs.readFile(file, 'utf8', (err, text) => {
      if (err) {
        reject(err);
        return;
      }
      const lines = text.split(/\r?\n/).filter((line) => line.length > 0);
      resolve(lines);
    });
  });
}
|
|
|
/**
 * Emit one result on stdout as a single line of JSON.
 *
 * @param {*} res - value to serialise.
 */
function printResult(res) {
  const line = JSON.stringify(res);
  console.log(line);
}
|
|
|
/**
 * Read the URL list, fetch every URL in batches of BATCH_SIZE (one incognito
 * browser context per batch), print each result and log progress.
 *
 * The browser and each context are closed even when a step throws, so a
 * failure does not leave a browser process behind.
 *
 * @returns {Promise<void>} rejects with the first error that interrupts the run.
 */
async function main() {
  // Launch the browser and read the input concurrently.
  const browserP = getBrowser();
  const urlsP = readList();
  let browser;
  let urls;
  try {
    [browser, urls] = await Promise.all([browserP, urlsP]);
  } catch (err) {
    // If only the input failed, the browser may still come up: shut it down.
    // Errors from this cleanup are ignored because `err` is what gets reported.
    browserP.then((launched) => launched.close()).catch(() => {});
    throw err;
  }

  let done = 0;
  const toDo = urls.length;
  try {
    for (const batch of makeBatches(urls, BATCH_SIZE)) {
      const ctx = await browser.createIncognitoBrowserContext();
      try {
        for (const res of getBatch(ctx, batch)) {
          printResult(await res);
          done++;
          log(`${done}/${toDo} : ${100*done/toDo|0}%`);
        }
        // Diagnostic: report the pages the browser still lists after the batch.
        const pages = await browser.pages();
        log(`${pages.length} opened pages: ${pages.map(p=>p.url())}`);
      } finally {
        await ctx.close();
      }
    }
  } finally {
    await browser.close();
  }
  log('done');
}
|
|
|
// Entry point: run the crawl; on failure report the error on stderr and make
// the process exit with a non-zero status instead of leaving the rejection
// unhandled.
main().catch((err) => {
  log(`fatal: ${err && err.stack ? err.stack : err}`);
  process.exitCode = 1;
});