Skip to content

Instantly share code, notes, and snippets.

@googya
Last active December 23, 2015 08:09
Show Gist options
  • Select an option

  • Save googya/6605852 to your computer and use it in GitHub Desktop.

Select an option

Save googya/6605852 to your computer and use it in GitHub Desktop.
Download the videos and PDFs linked from http://www.oracle.com/technetwork/java/javase/community/jvmls2013-2013900.html, using underscore, request, jquery, async, and child_process exec.
var _u = require('underscore')
var request = require('request')
var $ = require('jquery')
var fs = require('fs');
var async = require('async');
var _url = require('url');
var http = require('http')
var exec = require('child_process').exec;
var spawn = require('child_process').spawn;
// Page whose download links are scraped.
var url = "http://www.oracle.com/technetwork/java/javase/community/jvmls2013-2013900.html";

// Host part of that page's address; parse_html prepends it to links that
// come without a host of their own.
var outter_host = _url.parse(url).host;
/**
 * Fetches the page named by the module-level `url` with `request.get`.
 *
 * @param {function} callback Node-style callback. It is called with the
 *   request error as its only argument when the GET fails, otherwise with
 *   `null` and the response body.
 */
function get_html(callback) {
  request.get(url, function (err, res, body) {
    if (err) {
      // Report the failure instead of handing an undefined body to the
      // next step as if the request had succeeded.
      return callback(err);
    }
    callback(null, body);
  });
}
/**
 * Collects the download links of one file type from the page markup.
 *
 * Selects every anchor inside `.dataTable` whose href contains
 * `"." + type` and turns each one into an absolute URL.
 *
 * @param {string} body     Markup handed to `$()`.
 * @param {string} type     File extension without the dot, e.g. "mov".
 * @param {function} callback Node-style callback; called with `null` and the
 *   list of URLs.
 */
function parse_html(body, type, callback) {
  var anchors = $(body).find(".dataTable a[href*='." + type + "']");
  var _urls = _u.map(anchors, function (e) {
    var host = _url.parse(e.href).host;
    console.log(" host is " + host + " typeof host " + typeof host);
    if (_u.isEmpty(host)) {
      // Resolve host-less links against the page address. Plain string
      // concatenation glued the host and path together whenever the href
      // did not start with a single "/".
      return _url.resolve(url, e.href);
    }
    return e.href;
  });
  callback(null, _urls);
}
// Directory prefix that downloaded files are written under.
var DOWNLOAD_DIR = './'

/**
 * Final callback of the scraping pipeline: starts one download per URL.
 *
 * @param {*} err        Error reported by an earlier pipeline step, if any.
 * @param {string[]} urls Absolute URLs of the files to download.
 */
function download_all_files(err, urls) {
  if (err) {
    // An earlier step failed, so there is no URL list to work on; say so
    // rather than silently doing nothing.
    console.error(" could not collect urls: " + (err.message || err));
    return;
  }
  console.log(" urls is " + urls)
  _u.each(urls, function (file_url) {
    download_file_raw(file_url);
  });
}
/**
 * Wraps a value in single quotes so a POSIX shell treats it as one literal
 * word; embedded single quotes are closed, escaped and reopened.
 *
 * @param {*} value Value to quote (converted with String()).
 * @returns {string} The quoted word.
 */
function shell_quote(value) {
  return "'" + String(value).replace(/'/g, "'\\''") + "'";
}

/**
 * Downloads one file by running `wget -c -P DOWNLOAD_DIR <url>` through
 * `exec`.
 *
 * @param {string} file_url URL of the file to fetch.
 * @throws Rethrows the error reported by `exec` from inside its callback.
 */
function download_file_wget(file_url) {
  // Quote both arguments: URLs routinely contain "&", ";" or "?" which the
  // shell would otherwise interpret, and scraped links are untrusted input.
  var wget = 'wget -c -P ' + shell_quote(DOWNLOAD_DIR) + ' ' + shell_quote(file_url);
  // The enlarged maxBuffer leaves room for the output wget produces.
  exec(wget, {maxBuffer: 20000 * 1024}, function (err, stdout, stderr) {
    if (err) throw err;
    console.log(file_url + ' download to ' + DOWNLOAD_DIR);
  });
}
/**
 * Downloads one file over plain HTTP and stores it under DOWNLOAD_DIR,
 * named after the last segment of the URL path.
 *
 * Failures (request errors, non-2xx responses, write errors) are logged
 * with console.error; nothing is thrown to the caller.
 *
 * @param {string} file_url Absolute http:// URL of the file.
 */
function download_file_raw(file_url) {
  var parsed = _url.parse(file_url);
  var file_name = parsed.pathname.split('/').pop();
  var options = {
    // `hostname` excludes the port, unlike `host`, so URLs with an explicit
    // port are addressed correctly.
    hostname: parsed.hostname,
    port: parsed.port || 80,
    // `path` keeps the query string, which `pathname` would drop.
    path: parsed.path
  };
  http.get(options, function (res) {
    if (res.statusCode < 200 || res.statusCode >= 300) {
      console.error(file_name + ' not downloaded: HTTP ' + res.statusCode);
      // Drain the response so the socket is released.
      res.resume();
      return;
    }
    // Open the target only once there is a body worth saving, so failed
    // requests do not leave empty files behind.
    var file = fs.createWriteStream(DOWNLOAD_DIR + file_name);
    file.on('error', function (err) {
      console.error(file_name + ' could not be written: ' + err.message);
    });
    file.on('finish', function () {
      console.log(file_name + ' download to ' + DOWNLOAD_DIR)
    });
    // pipe() handles backpressure and ends the file when the response ends.
    res.pipe(file);
  }).on('error', function (err) {
    console.error(file_name + ' not downloaded: ' + err.message);
  });
}
// Entry point: fetch the page, extract the ".mov" links, then hand the
// resulting list (or the error) to download_all_files.
async.waterfall(
  [
    get_html,
    function (html, next) {
      parse_html(html, "mov", next);
    }
  ],
  download_all_files
);
@googya
Copy link
Copy Markdown
Author

googya commented Sep 18, 2013

When using exec, set maxBuffer if the command's output may exceed the default buffer size.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment