Created
January 30, 2011 04:53
-
-
Save guybrush/802552 to your computer and use it in GitHub Desktop.
testing performance of parsing xml/html with nodejs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//
// this is about the parsing performance of xml/html parsers
// to be more precise, i just want to look for specific nodes/attributes
//
// i am testing
//
// * [htmlparser](https://github.com/tautologistics/node-htmlparser)
// * [html5](https://github.com/aredridel/html5)
// * [sax](https://github.com/isaacs/sax-js)
// * [jsdom](https://github.com/tmpvar/jsdom) + sizzle/jquery
//
// the output i get is:
//
// htmlparser done in 73 ms - memory: 2.09375 mb RSS - found 200 items
// html5 done in 1727 ms - memory: 15.60546875 mb RSS - found 0 items (TODO)
// sax done in 100 ms - memory: 0.0390625 mb RSS - found 200 items
// jsdom/jquery done in 728 ms - memory: 0.1171875 mb RSS - found 200 items
// flow-control helpers, the http client and the parsers under test,
// followed by the benchmark settings shared by every test function
var Seq = require('seq')
  , Step = require('step')
  , events = require('events')
  , request = require('request')
  , saxLib = require('sax')
  , html = require('htmlparser')
  , html5 = require('html5')
  , jsdom = require('jsdom')
  // document that is fetched once and handed to every benchmark
  , uri = 'http://twitter.com/statuses/user_timeline/18975861.rss'
  // tag name each benchmark counts
  , lookfor = 'item'
  // number of times each benchmark parses the document
  , n = 10
//------------------------------------------------ request xml and test all
// Fetch the document once, then run the benchmarks one after another on the
// same body. Each test function receives Seq's `this` as its completion
// callback, so the next step only starts once the previous one has called it.
request({uri: uri}, function(err, res, body) {
  if (err) {
    // without a body there is nothing to benchmark
    console.error('request for %s failed: %s', uri, err)
    return
  }
  Seq()
    .seq(function() { testHtml(body, this) })
    .seq(function() { testHtml5(body, this) })
    .seq(function() { testSax(body, this) })
    .seq(function() { testJsdom(body, this) })
    .seq(function() { console.log('all done') })
})
//------------------------------------------------ tautologistic's htmlparser
// Parse `body` n times with htmlparser and count every tag named `lookfor`
// by walking the dom that the handler callback receives.
//
// body - markup handed to parseComplete()
// cb   - called without arguments once all n runs are done, or with the
//        handler's error as first argument if a run reports one
function testHtml(body, cb) {
  var started = Date.now()
    , rssBefore = process.memoryUsage().rss
    , runs = 0
    , items = 0
    , htmlParser

  // add matching tags in `nodes` (and, recursively, their children) to `items`
  function countItems(nodes) {
    for (var i = 0; i < nodes.length; i++) {
      var node = nodes[i]
      if (node.type === 'tag' && node.name === lookfor) items++
      if (node.children && node.children.length) countItems(node.children)
    }
  }

  var htmlHandler = new html.DefaultHandler(function(err, dom) {
    if (err) {
      // do not report numbers for a run the parser flagged as failed
      cb(err)
      return
    }
    countItems(dom)
    // >= rather than == so that n <= 1 still terminates after the first run
    if (++runs >= n) {
      console.log
        ( 'htmlparser done in %s ms - memory: %s mb RSS - found %s items'
        , (Date.now())-started, (process.memoryUsage().rss-rssBefore)/1048576, items )
      cb()
    } else {
      // the next run is started from inside the handler callback
      htmlParser.parseComplete(body)
    }
  })
  htmlParser = new html.Parser(htmlHandler)
  htmlParser.parseComplete(body)
}
//------------------------------------------------ aredridel's html5
// Feed `body` to the html5 parser n times through an EventEmitter that the
// parser is attached to via parser.parse(). Nothing is counted here yet, so
// the reported number of items is always 0 (TODO).
//
// body - markup emitted as the payload of each 'data' event
// cb   - called without arguments after the last run
function testHtml5(body, cb) {
  var parser = new html5.Parser()
    , source = new events.EventEmitter()
    , started = Date.now()
    , rssBefore = process.memoryUsage().rss
    , items = 0
  parser.on('done', function() { // i guess this is not done yet? although it's in the doc
    console.log('html5-done')
  })
  parser.parse(source)
  // repeat n times like the other benchmarks (this used to be a literal 10)
  for (var run = 0; run < n; run++) {
    source.emit('data', body)
    source.emit('end')
  }
  console.log
    ( 'html5 done in %s ms - memory: %s mb RSS - found %s items (TODO)'
    , (Date.now())-started, (process.memoryUsage().rss-rssBefore)/1048576, items )
  cb()
}
//------------------------------------------------ isaac's sax-parser
// Parse `body` n times with a strict sax parser and count every opening tag
// named `lookfor`.
//
// body - markup written to the parser on each run
// cb   - called without arguments from the n-th 'end' event
function testSax(body, cb) {
  var sax = saxLib.parser(true)
    , started = Date.now()
    , rssBefore = process.memoryUsage().rss
    , runs = 0
    , items = 0
  sax.onerror = function(err) {
    console.log(err)
    // the parser stays in its error state until resumed; without this the
    // following write()/close() would throw and abort the benchmark
    sax.resume()
  }
  sax.onopentag = function(node) {
    if (node.name === lookfor) items++
  }
  sax.onend = function() {
    if (++runs === n) {
      console.log
        ( 'sax done in %s ms - memory: %s mb RSS - found %s items'
        , (Date.now())-started, (process.memoryUsage().rss-rssBefore)/1048576, items )
      cb()
    }
  }
  // `runs` is advanced by onend, which close() is expected to trigger
  while (runs < n) sax.write(body).close()
}
//------------------------------------------------ tmpvar's jsdom + sizzle/jquery
// Create a jsdom window, add jQuery to it, then n times replace the content
// of <body> with `body` and count the elements matching `lookfor`.
// Timing and memory are sampled inside the jQueryify callback, so window
// creation and loading jQuery are not part of the reported numbers.
//
// body - markup assigned through jQuery's .html()
// cb   - called without arguments after the last run
function testJsdom(body, cb) {
  var window = jsdom.jsdom().createWindow()
  jsdom.jQueryify(window, 'http://code.jquery.com/jquery.min.js', function() {
    var started = Date.now()
      , rssBefore = process.memoryUsage().rss
      , items = 0
    for (var run = 0; run < n; run++) {
      window.$('body').html(body)
      items += window.$(lookfor).length
    }
    console.log
      ( 'jsdom/jquery done in %s ms - memory: %s mb RSS - found %s items'
      , (Date.now())-started, (process.memoryUsage().rss-rssBefore)/1048576, items )
    cb()
  })
}
with HEAD (6019785d) i get
sax done in 117 ms - memory: 1.96484375 mb RSS
htmlparser done in 47 ms - memory: 0.5703125 mb RSS
jsdom/jquery done in 924 ms - memory: 5.82421875 mb RSS
good job! :D
anyway this test/code is very incomplete and may not be worth anything - just wanted to make a snapshot
cool, looks like I managed to chop off 1/3 of your execution time! I can't wait to spend some real time optimizing.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
which version of jsdom are you using? I would try HEAD