Created
April 4, 2011 12:52
-
-
Save ryuone/901580 to your computer and use it in GitHub Desktop.
Node.js program that parses an HTML page and saves the image files it references.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* node getImages.js http://www.yahoo.co.jp */ | |
var htmlparser = require('htmlparser'); | |
var sys = require('sys'); | |
var http = require('http'); | |
var fs = require('fs'); | |
var url = require('url'); | |
var path = require('path'); | |
/**
 * Script entry point.
 *
 * Validates the command line, downloads the page named by the single URL
 * argument, and then fetches every image URL that MyHtmlParser extracts
 * from that page, handing each download to writeImageFile.
 *
 * The logic lives in a function so that the early `return`s are valid
 * regardless of how the file is loaded.
 */
function main(){
    if(process.argv.length !== 3){
        console.log('arguments error.');
        return;
    }

    var parsedUrl = url.parse(process.argv[2]);
    if(parsedUrl.host == null){
        console.log('URL parsed Error.(' + process.argv[2] + ')');
        return;
    }

    var html = [];

    // http.createClient() has been removed from Node.js. http.get() takes the
    // connection details as options and ends the request automatically.
    var request = http.get({
        host: parsedUrl.hostname,
        port: parsedUrl.port ? parsedUrl.port : 80,
        // Include the query string so pages such as /index.php?id=1 load.
        path: (parsedUrl.pathname || '/') + (parsedUrl.search || '')
    }, function(res){
        console.log("->Start");
        console.log('STATUS : ' + res.statusCode);
        console.log('HEADER : ' + JSON.stringify(res.headers));
        res.setEncoding('utf8');
        res.on('data', function(chunk){
            html.push(chunk);
        });
        res.on('end', function(){
            console.log("->End");
            var myhtmlparser = new MyHtmlParser(parsedUrl);
            myhtmlparser.parse(html.join(''));
            // Pass {tagName:'a'} instead to collect images that are linked
            // rather than embedded.
            myhtmlparser.getImageElements({tagName:'img'});
            for(var i=0,imax=myhtmlparser.urls.length; i<imax; i++){
                console.log(myhtmlparser.urls[i]);
                myhtmlparser.getImage(url.parse(myhtmlparser.urls[i]), writeImageFile);
            }
        });
    });

    // Without this handler a DNS or connection failure is thrown as an
    // uncaught 'error' event.
    request.on('error', function(error){
        console.log('Request Error.(' + error.message + ')');
    });
}
main();
/**
 * Saves one downloaded image into the local "imagedir" directory, creating
 * the directory on first use.
 *
 * @param {string|Buffer} imageData  Image bytes; a string is written with
 *                                   the 'binary' encoding.
 * @param {string} fname             File name to use inside the directory.
 *                                   Any directory part is discarded.
 * @param {function(?Error)=} done   Optional callback invoked once the file
 *                                   has been written or the operation failed.
 */
function writeImageFile(imageData, fname, done){
    var imagedir = 'imagedir';
    // The name is derived from a remote URL, so never let it escape imagedir.
    var target = path.join(imagedir, path.basename(fname));

    function finish(error){
        if(error){
            console.log(error);
        }
        if(typeof done === 'function'){
            done(error || null);
        }
    }

    // Several downloads can finish at nearly the same time. Creating the
    // directory unconditionally and tolerating EEXIST avoids the race where
    // two callers both see it missing and the second mkdir throws.
    // parseInt('766', 8) is the same mode as the legacy octal literal 0766,
    // which is a syntax error in strict mode.
    fs.mkdir(imagedir, parseInt('766', 8), function(error){
        if(error && error.code !== 'EEXIST'){
            finish(error);
            return;
        }
        fs.writeFile(target, imageData, 'binary', finish);
    });
}
/* -------------------------------------------------------- */ | |
/**
 * Holds the state needed to parse one HTML page and collect URLs from it.
 *
 * Must be invoked with `new`.
 *
 * Instance fields set here:
 *   parsedUrl    - the value passed in
 *   parsedResult - null until a document has been parsed
 *   urls         - empty array that collected URLs are appended to
 *   handler      - htmlparser.DefaultHandler receiving the parse result
 *
 * @param {Object} parsedUrl  Parsed URL of the page (the script passes the
 *                            result of url.parse()).
 * @throws {Error} When called without `new`.
 */
var MyHtmlParser = function(parsedUrl){
    // Check before touching `this`: a plain call would otherwise write the
    // fields onto the global object (sloppy mode) or fail with an unrelated
    // TypeError because `this` is undefined (strict mode).
    if (this == null || this === global){
        throw new Error("Error this is global.");
    }
    this.parsedUrl = parsedUrl;
    this.parsedResult = null;
    this.urls = [];
    this.handler = new htmlparser.DefaultHandler(function (error, dom) {
        if(error){
            // sys.debug() no longer exists in current Node.js; console.error
            // also writes to stderr.
            console.error("Error : " + error);
        }
    });
};
/**
 * Parses an HTML string with this instance's handler and keeps the DOM the
 * handler produced in `this.parsedResult`.
 *
 * @param {string} html  Markup to parse.
 */
MyHtmlParser.prototype.parse = function(html){
    var domHandler = this.handler;
    new htmlparser.Parser(domHandler).parseComplete(html);
    this.parsedResult = domHandler.dom;
};
/**
 * Collects the URLs of JPEG images referenced by elements of the parsed
 * document and appends them to `this.urls` as absolute URLs.
 *
 * For every element whose tag name equals `option.tagName`, the `href`
 * attribute (or `src` when there is no `href`) is taken. Values that do not
 * end in ".jpg" (case-insensitive) are ignored.
 *
 * @param {{tagName: string}=} option  Tag to inspect, in lower case;
 *                                     defaults to { tagName: 'img' }.
 */
MyHtmlParser.prototype.getImageElements = function(option){
    option = option == null ? { tagName : 'img' } : option;
    var wantedTag = option.tagName;
    var tags = htmlparser.DomUtils.getElements({ tag_name: function(val){
        return val.toLowerCase() === wantedTag;
    }}, this.parsedResult);

    // Base for resolving relative links. url.format() rebuilds it from the
    // parsed parts, so the port and the page's directory are kept.
    var pageUrl = url.format(this.parsedUrl);

    // Start at 0: beginning at 1 silently dropped the first matching element.
    for(var i=0,imax=tags.length; i<imax; i++){
        var attribs = tags[i].attribs;
        var link = attribs && (attribs.href || attribs.src);
        if(!link || !/\.jpg$/i.test(link)){
            continue;
        }
        if(/^http:\/\/.*/.test(link)){
            // Already absolute: keep it exactly as written in the page.
            this.urls.push(link);
        }else{
            // Handles "/root.jpg", "dir/rel.jpg", "//host/x.jpg" and
            // "https://..." alike, which plain concatenation got wrong.
            this.urls.push(url.resolve(pageUrl, link));
        }
    }
};
/**
 * Downloads one image over HTTP and hands it to `callback`.
 *
 * Redirect responses are followed (up to `redirectsLeft` times); any other
 * non-200 status is only logged. Only plain HTTP is spoken, so a redirect to
 * an https:// location is still requested over HTTP.
 *
 * @param {Object} parsedUrl        Parsed URL of the image (result of
 *                                  url.parse()).
 * @param {function(string, string)} callback
 *                                  Called on success with the body as a
 *                                  'binary'-encoded string and the file name
 *                                  taken from the last path segment.
 * @param {number=} redirectsLeft   How many more redirects may be followed;
 *                                  defaults to 5.
 */
MyHtmlParser.prototype.getImage = function(parsedUrl, callback, redirectsLeft){
    var self = this;
    var chunks = [];
    var REDIRECT_CODES = [301, 302, 303, 307, 308];

    if(redirectsLeft == null){
        redirectsLeft = 5;
    }

    var request = http.get({
        // `host` from url.parse() includes the port, which breaks the lookup.
        host: parsedUrl.hostname || parsedUrl.host,
        port: parsedUrl.port || 80,
        path: (parsedUrl.pathname || '/') + (parsedUrl.search || '')
    }, function(res){
        res.setEncoding('binary');
        res.on('data', function(chunk){
            chunks.push(chunk);
        });
        res.on('end', function(){
            var status = res.statusCode;

            if(status === 200){
                // path.basename also copes with root-level paths such as
                // "/logo.jpg", where the previous regular expression returned
                // null and the lookup of [1] threw.
                var fname = path.basename(parsedUrl.pathname || '');
                if(!fname){
                    console.log(' -> no file name in ' + parsedUrl.pathname);
                    return;
                }
                callback(chunks.join(''), fname);
                return;
            }

            if(REDIRECT_CODES.indexOf(status) !== -1 && res.headers.location){
                if(redirectsLeft <= 0){
                    console.log(' -> too many redirects');
                    return;
                }
                // Location may be relative; resolve it against this request.
                var next = url.resolve(url.format(parsedUrl), res.headers.location);
                self.getImage(url.parse(next), callback, redirectsLeft - 1);
                return;
            }

            console.log(' -> ' + status);
        });
    });

    // An unhandled 'error' event (DNS failure, refused connection) would
    // otherwise terminate the whole run.
    request.on('error', function(error){
        console.log(' -> ' + error.message);
    });
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment