Last active
June 28, 2020 17:55
-
-
Save pars3c/6541c7fa38748a244482f511733e7707 to your computer and use it in GitHub Desktop.
Comic Book Crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const Apify = require('apify'); | |
const fs = require('fs'); | |
const b2CloudStorage = require('b2-cloud-storage'); | |
/**
 * Uploads the local `./comics.csv` file to a Backblaze B2 bucket.
 *
 * Adapts the callback-based `b2-cloud-storage` client to a Promise so callers
 * can `await` the end of the upload. The account id, application key and
 * bucket id below are placeholders and must be replaced with real values.
 *
 * @returns {Promise<*>} Resolves with the `results` value handed to the
 *   upload callback; rejects with the error reported by either the
 *   authorization step or the upload step.
 */
async function backblazeUpload() {
  return new Promise((resolve, reject) => {
    const b2 = new b2CloudStorage({
      auth: {
        accountId: '<account_id>',
        applicationKey: '<application_key>',
      },
    });

    b2.authorize((authErr) => {
      if (authErr) {
        // Throwing inside this callback would escape the Promise; reject instead
        // so the awaiting caller actually sees the failure.
        reject(authErr);
        return;
      }

      // this function wraps both a normal upload AND a large file upload
      b2.uploadFile('./comics.csv', {
        bucketId: '<bucket_id>',
        fileName: 'comics.csv', // this is the object storage "key". Can include a full path
        contentType: 'text/csv',
        onUploadProgress(update) {
          console.log(`Progress: ${update.percent}% (${update.bytesDispatched}/${update.bytesTotal})`);
        },
      }, (uploadErr, results) => {
        console.log(uploadErr, results);
        if (uploadErr) {
          reject(uploadErr);
          return;
        }
        // Settle the Promise; without this the caller's `await` never returns.
        resolve(results);
      });
    });
  });
}
// Entry point: crawl the series index page, append every title found to
// "comics.csv", then upload that file via backblazeUpload().
Apify.main(async () => {
  const requestQueue = await Apify.openRequestQueue();
  await requestQueue.addRequest({ url: 'http://nv3x2jozywh63fkohn5mwp2d73vasusjixn3im3ueof52fmbjsigw6ad.onion/series/' });

  const crawler = new Apify.PuppeteerCrawler({
    requestQueue,
    launchPuppeteerOptions: {
      // The start URL is a .onion address, so the browser is sent through a
      // local SOCKS5 proxy (presumably a Tor daemon on its default port — confirm).
      proxyUrl: 'socks5://localhost:9050',
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
      headless: true,
      executablePath: '/usr/bin/chromium-browser',
      useChrome: false,
    },
    handlePageFunction: async ({ request, page }) => {
      console.log(`Processing ${request.url}...`);

      // Get all the titles
      const titles = await page.evaluate(() => Array.from(
        document.querySelectorAll('h2 + ul li a'), (element) => element.textContent));

      if (titles.length === 0) {
        return;
      }

      // Append the titles to "comics.csv" in one awaited write. A single write
      // keeps the page's titles in order, and awaiting it makes a write failure
      // fail this request and guarantees the data is on disk before the upload.
      const rows = titles.map((title) => `${title}\n`).join('');
      await fs.promises.appendFile('comics.csv', rows);
    },
  });

  // Run the crawler and wait for it to finish.
  await crawler.run();
  await backblazeUpload();
  console.log('Crawler finished.');
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment