Skip to content

Instantly share code, notes, and snippets.

@pars3c
Last active June 28, 2020 17:55
Show Gist options
  • Save pars3c/6541c7fa38748a244482f511733e7707 to your computer and use it in GitHub Desktop.
Save pars3c/6541c7fa38748a244482f511733e7707 to your computer and use it in GitHub Desktop.
Comic Book Crawler
const Apify = require('apify');
const fs = require('fs');
const b2CloudStorage = require('b2-cloud-storage');
/**
 * Uploads the local `./comics.csv` file to a Backblaze B2 bucket.
 *
 * Adapts the callback-style `b2-cloud-storage` client to a promise so that
 * callers can `await` completion of the upload.
 *
 * @returns {Promise<*>} Resolves with the `results` value that the client
 *   passes to the `uploadFile` callback once the upload has finished.
 *   Rejects with the error reported by `authorize` or `uploadFile`.
 */
async function backblazeUpload() {
  return new Promise((resolve, reject) => {
    const b2 = new b2CloudStorage({
      auth: {
        accountId: '<account_id>',
        applicationKey: '<application_key>',
      },
    });

    b2.authorize((authErr) => {
      if (authErr) {
        // Throwing here would escape the promise and surface as an uncaught
        // exception; reject so the awaiting caller sees the failure.
        reject(authErr);
        return;
      }

      // this function wraps both a normal upload AND a large file upload
      b2.uploadFile('./comics.csv', {
        bucketId: '<bucket_id>',
        fileName: 'comics.csv', // this is the object storage "key". Can include a full path
        contentType: 'text/csv',
        onUploadProgress(update) {
          console.log(`Progress: ${update.percent}% (${update.bytesDispatched}/${update.bytesTotal})`);
        },
      }, (uploadErr, results) => {
        console.log(uploadErr, results);
        if (uploadErr) {
          reject(uploadErr);
          return;
        }
        // Settle the promise; without this the caller's `await` never returns.
        resolve(results);
      });
    });
  });
}
// Entry point: crawl the series index through a local SOCKS5 proxy, append
// every title found to comics.csv, then upload that file via backblazeUpload().
Apify.main(async () => {
  const requestQueue = await Apify.openRequestQueue();
  await requestQueue.addRequest({ url: 'http://nv3x2jozywh63fkohn5mwp2d73vasusjixn3im3ueof52fmbjsigw6ad.onion/series/' });

  const crawler = new Apify.PuppeteerCrawler({
    requestQueue,
    launchPuppeteerOptions: {
      proxyUrl: 'socks5://localhost:9050',
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
      headless: true,
      executablePath: '/usr/bin/chromium-browser',
      useChrome: false,
    },
    handlePageFunction: async ({ request, page }) => {
      console.log(`Processing ${request.url}...`);

      // Get all the titles (text of each link in a list directly after an <h2>)
      const titles = await page.evaluate(() => Array.from(
        document.querySelectorAll('h2 + ul li a'),
        (element) => element.textContent,
      ));

      // Nothing matched: leave comics.csv untouched.
      if (titles.length === 0) {
        return;
      }

      // Append the titles to "comics.csv", one per line. A single awaited
      // write keeps them in page order and guarantees the data has been
      // handed to the file system before the handler returns, so the upload
      // below cannot start while writes are still pending. A write failure
      // now rejects the handler instead of throwing from a detached callback.
      const rows = titles.map((title) => `${title}\n`).join('');
      await fs.promises.appendFile('comics.csv', rows);
    },
  });

  // Run the crawler and wait for it to finish.
  await crawler.run();
  await backblazeUpload();
  console.log('Crawler finished.');
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment