Last active
April 25, 2020 16:26
-
-
Save iexa/5ec548fe85d902ab653fafd11a254fbd to your computer and use it in GitHub Desktop.
mass files downloader using node.js + json file. {also for es6 concepts}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// dl.js script. dl.json is either {"folder_name1": [url1, url2, url3, ...], "folder_name2": [...], ...}
// or a plain array of urls [url1, url2, ...] (files are then saved into the current folder).
//
// 1st "major" mod - now uses async "threads" to download several files at once. This way of
// working is not the best way to do it, but it saves some time and does not overwhelm servers.
//
// examples for data scraping (run in the browser console) :D
// a = document.querySelectorAll('ul#list>li>a.folder')
// JSON.stringify(Array.prototype.map.call(a, i => decodeURI(i.href.split('/').reverse()[1])))
//
// ... and files from inside folders:
// a = document.querySelectorAll('ul#list>li[class~="file"]>a')
// JSON.stringify(Array.prototype.map.call(a, i => i.href.split('?a')[0]))
const fs = require('fs') | |
const path = require('path') | |
const qs = require('querystring') | |
const ht = require('https') | |
const data = require('./dl.json') // JSON.parse( js.readFileSync('dl.json') ) | |
// create epoch secs.msec [xxx.y]
// Returns the current Unix time in seconds as a number rounded to one decimal
// place (via roundfr); used only for measuring elapsed wall-clock time.
const get_time_frsec = () => roundfr(Date.now() / 1000)
// Round x to one decimal place. Math.round rounds halves towards +Infinity,
// so roundfr(1.25) is 1.3 and roundfr(-1.25) is -1.2.
const roundfr = x => Math.round(x * 10) / 10
// return filename part only from url path
// The url is percent-decoded BEFORE it is split on '/', so an encoded slash
// (%2F) can never end up inside the returned name. A url that ends in '/'
// yields an empty string.
const get_filename = x => qs.unescape(x).split('/').pop()
// Download `url` over https and stream the response body into the file `name`.
//
// Resolves with {name, time} once the file has been completely written, where
// `time` is the number of seconds between receiving the response and the end
// of the write. Rejects with the underlying error when the request, the
// response or the file write fails; a partially written file is removed first
// so a later run does not mistake it for a finished download.
const get_script = function (url, name) {
  return new Promise((resolve, reject) => {
    const request = ht.get(url, resp => {
      const started_ms = Date.now()
      const out = fs.createWriteStream(name, 'binary')
      let settled = false

      const fail = err => {
        if (settled) return
        settled = true
        resp.destroy()
        // unlink only after the descriptor is closed, otherwise a still
        // pending open() could re-create the file after it was removed
        out.once('close', () => fs.unlink(name, () => reject(err)))
        out.destroy()
      }

      resp.on('error', fail)
      out.on('error', fail)
      // 'finish' (not the response 'end') guarantees the data was flushed,
      // so a following fs.statSync(name) sees the real size
      out.on('finish', () => {
        if (settled) return
        settled = true
        resolve({name, time: (Date.now() - started_ms) / 1000})
      })
      resp.pipe(out) // streams to allow low mem. usage
    })
    // connection level failures (dns, refused, tls...) are emitted on the request
    request.on('error', reject)
  })
}
// so await could be used to grab files sequentially
//
// Download every url in `files` into the folder `dirname`, running at most
// `threads` downloads at the same time. Urls whose target file already exists
// in `dirname` are skipped. The caller's `files` array is left untouched.
// A failed download is reported on its own line and does not stop the others.
async function dofiles(dirname, files) {
  // check for prev. downloaded files; no hash||size check [TODO, maybe]
  // (works on a copy, so the array that came from dl.json is never emptied)
  const all_files = [...files]
  const pending = all_files.filter(
    file => !fs.existsSync(path.join(dirname, get_filename(file))))
  if (pending.length < all_files.length) {
    console.log(` > ...skipped ${all_files.length-pending.length} already existing files`)
  }

  const files_cnt = pending.length
  const cnt_width = files_cnt.toString().length
  // cursor movement only exists (and only makes sense) on a real terminal
  const is_tty = Boolean(process.stdout.isTTY)
  let files_now = 0
  let parallel_queue = []

  while (pending.length) {
    // needed for some servers -- can be used with qs; every '#' is encoded
    const file = pending.shift().replace(/#/g, '%23')
    const file_name = get_filename(file)
    files_now += 1
    const files_progress = files_now.toString().padStart(cnt_width, '0')
    process.stdout.write(` ${files_progress}/${files_cnt} '${file_name}' \n`)

    // fill list of files to dl in parallel; every job settles to a result
    // object so one failure cannot reject the whole batch
    parallel_queue.push({
      file_name,
      outcome: get_script(file, path.join(dirname, file_name))
        .then(done => ({ok: true, done}), error => ({ok: false, error})),
    })

    // do processing only if last file or parallel queue full
    if (pending.length !== 0 && parallel_queue.length < threads) continue

    const outcomes = await Promise.all(parallel_queue.map(entry => entry.outcome))
    // jump back up to the first progress line of this batch
    if (is_tty) process.stdout.moveCursor(0, -parallel_queue.length)
    outcomes.forEach((outcome, idx) => {
      let summary
      if (outcome.ok) {
        const {name, time} = outcome.done
        const {size} = fs.statSync(name)
        summary = ` [${roundfr(size/1024/1024)}mb @ ${roundfr(time)}s]`
      } else {
        summary = ` [FAILED: ${outcome.error}]`
      }
      if (is_tty) {
        // append the result to the right of the already printed file name
        process.stdout.moveCursor(68, 0)
        process.stdout.write(`${summary}\n`)
      } else {
        // no cursor control: repeat the name so the line is self-explanatory
        process.stdout.write(` '${parallel_queue[idx].file_name}'${summary}\n`)
      }
    })
    parallel_queue = [] // reset queue
  }
}
// go sequentially first folders then (up) files
//
// `data` maps a folder name to the list of urls to download into it. Folders
// are handled one after the other; a missing folder (including any missing
// parent folders) is created first. An error in one folder is logged and the
// next folder is still processed.
const doitall = async (data) => {
  for (const [dirname, files] of Object.entries(data)) {
    console.log(`>>> Getting "${dirname}"`)
    try {
      if (!fs.existsSync(dirname)) {
        // recursive so that names like "a/b" work as folder keys too
        fs.mkdirSync(dirname, {recursive: true})
      }
      await dofiles(dirname, files)
      console.log(` --------`)
    } catch (err) {
      console.log(`!!! ERROR !!! ${err}`)
    }
  }
}
console.log('>>> JS^2ON-file grabber v0.057 🚄 alpha by iexa\n')
// 1st and only param: number of `threads` (concurrent async downloads), def. 2.
// Anything that is missing, not a number or below 1 falls back to 2.
const requested_threads = Number.parseInt(process.argv[2], 10)
const threads = Number.isNaN(requested_threads) || requested_threads < 1 ? 2 : requested_threads
const time_start = get_time_frsec()
// dl.json is either [files only] (saved into the current folder)
// or {subfolder: [files], subfolder2: [files2]}
const results_promise = Array.isArray(data) ? dofiles('.', data) : doitall(data)
results_promise
  .then(_ => console.log(
    `>>> DONE. Took ${roundfr(get_time_frsec()-time_start)} secs overall.\n`))
  .catch(err => {
    // without this handler a failure would surface as an unhandled rejection
    console.error(`!!! ERROR !!! ${err}`)
    process.exitCode = 1
  })
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment