Skip to content

Instantly share code, notes, and snippets.

@02strich
Created March 28, 2020 15:52
Show Gist options
  • Save 02strich/cb057750abe1bdb1a2cdea2ec988d4bd to your computer and use it in GitHub Desktop.
Save 02strich/cb057750abe1bdb1a2cdea2ec988d4bd to your computer and use it in GitHub Desktop.
XML Parsing - a performance study
# benchmark
# MSXML: This can be downloaded from many places. You need 3.0
# which is NOT in most newly installed Windows boxes. (650kb)
# http://download.microsoft.com/download/xml/Install/3.0/WIN98Me/EN-US/msxml3.exe
# for a quick tutorial on MSXML 3.0, see
# http://www.perfectxml.com/articles/xml/msxml30.asp
# you should then run the COM MakePY utility on the Pythonwin menu.
# to get it going as fast as possible.
import sys
import glob
import time
import string
import io
def tupleTreeStats(node):
# counts tags and attributes recursively
# use for all reportlab parsers
if node[1] is None:
attrCount = 0
else:
attrCount = len(node[1])
nodeCount = 1
if node[2] is not None:
for child in node[2]:
if isinstance(child, (tuple, list)):
a, n = tupleTreeStats(child)
attrCount = attrCount + a
nodeCount = nodeCount + n
return attrCount, nodeCount
### pyRXP - our wrapper around Univ of Edinburgh
def getPyRXPParser():
import pyRXPU
p = pyRXPU.Parser()
return p
def getNonValidatingPyRXPParser():
import pyRXPU
p = pyRXPU.Parser(Validate=0)
return p
def parseWithPyRXP(parser, rawdata):
return parser.parse(rawdata)
### rparsexml - Aaron's very fast pure python parser
def loadRparseXML():
#it's a module, what the heck
from reportlab.lib import rparsexml
return rparsexml
def parseWithRParseXML(rparsexml, rawdata):
#first argument is a dummy holding none
return rparsexml.parsexml0(rawdata)[0]
### expattree - tree-building wrapper around pyexpat
def getExpatParser():
import expattree
return expattree.ExpatTreeParser()
def parseWithExpat(expatParser, rawdata):
#first argument is a dummy holding none
return expatParser.parse(rawdata)
####### minidom - non-validating DOM parser in the Python distro
def loadMiniDOM():
import xml.dom.minidom
return xml.dom.minidom
def parseWithMiniDOM(dom_module, rawdata):
#parser is None
return dom_module.parseString(rawdata)
def statsWithMiniDOM(node):
return (1, 0)
######### Microsoft XML Parser via COM ######################
def loadMSXML30():
from win32com.client import Dispatch
msx = Dispatch('Microsoft.XMLDOM')
return msx
def parseWithMSXML30(msx, rawdata):
msx.loadXML(rawdata)
return msx
def statsWithMSXML30(node):
#not done
return (1,0)
###########4DOM ###############
def load4DOM():
from xml.dom.ext.reader import PyExpat
from xml.dom import Node
reader = PyExpat.Reader()
return reader
def parseWith4DOM(reader, rawdata):
return reader.fromString(rawdata)
def statsWith4DOM(node):
#node
return (1,0)
def loadCDomlette():
from Ft.Lib import cDomlettec
return cDomlettec
def parseWithCDomlette(modul, rawdata):
io = io.StringIO(rawdata)
return modul.parse(io, '')
def statsWithCDomlette(node):
#node
return (1,0)
##########put them all together################
TESTMAP = [
# name of parser; function to initialize if needed;
# function to parse; function to do stats
('pyRXP', getPyRXPParser, parseWithPyRXP, tupleTreeStats),
('pyRXP_nonvalidating', getNonValidatingPyRXPParser, parseWithPyRXP, tupleTreeStats),
('rparsexml', loadRparseXML, parseWithRParseXML, tupleTreeStats),
('expat', getExpatParser, parseWithExpat, tupleTreeStats),
('minidom', loadMiniDOM, parseWithMiniDOM, statsWithMiniDOM),
('msxml30', loadMSXML30, parseWithMSXML30, statsWithMSXML30),
('4dom', load4DOM, parseWith4DOM, statsWith4DOM),
('cdomlette', loadCDomlette, parseWithCDomlette, statsWithCDomlette)
]
def interact(testName=None, dtd=1, pause='unknown'):
# if no DTD requested, trim off first 2 lines; the lack of
# a DTD reference will put validating parsers into non-
# validating mode
sampleText = open('ted_lsp_latency_2.xml', 'rb').read()
if testName:
found = 0
for row in TESTMAP:
if row[0] == testName:
found = 1
(name, loadFunc, parseFunc, statFunc) = row
break
if not found:
print('parser %s not found, please select' % testName)
if not testName:
# interactive, show stuff
print("Interactive benchmark suite for Python XML tree-parsers.")
print('Using sample XML file %d bytes long' % len(sampleText))
print("Parsers available:")
i = 1
for (name, a, b, c) in TESTMAP:
print('\t%d. %s' % (i, name))
i = i + 1
print()
inp = input('Parser number (or x to exit) > ')
if inp == 'x':
print('bye')
return
else:
num = int(inp)
(name, loadFunc, parseFunc, statFunc) = TESTMAP[num-1]
# force pause to 1 or 0 by asking
if pause == 'unknown':
inp = input("Shall we do memory tests? i.e. you look at Task Manager? y/n > ")
assert inp in 'yn', 'enter "y" or "n". Please run again!'
pause = (inp == 'y')
print('testing %s' % testName)
#load the parser
t0 = time.clock()
parser = loadFunc()
loadTime = time.clock() - t0
if pause:
baseMem = float(input("Pre-parsing: please input python process memory in kb > "))
t1 = time.clock()
parsedOutput = parseFunc(parser, sampleText)
t2 = time.clock()
parseTime = t2 - t1
if pause:
totalMem = float(input('Post-parsing: please input python process memory in kb > '))
usedMem = totalMem - baseMem
memFactor = usedMem * 1024.0 / len(sampleText)
t3 = time.clock()
n, a = statFunc(parsedOutput)
t4 = time.clock()
traverseTime = t4 - t3
print('counted %d tags, %d attributes' % (n, a))
if pause:
print('%s: init %0.4f, parse %0.4f, traverse %0.4f, mem used %dkb, mem factor %0.2f' % (
name, loadTime, parseTime, traverseTime, usedMem, memFactor))
else:
print('%s: init %0.4f, parse %0.4f, traverse %0.4f' % (
name, loadTime, parseTime, traverseTime))
print()
if __name__=='__main__':
import sys
args = sys.argv[:]
if '-nodtd' in args:
dtd=0
args.remove('-nodtd')
else:
dtd=1
if '-pause' in args:
pause = 1
args.remove('-pause')
elif '-nopause' in args:
pause = 0
args.remove('-nopause')
else:
pause = 'unknown' # it will ask
if len(args) > 1:
testName = args[1]
else:
testName = None
interact(testName, dtd, pause=pause)

Data Set

ted_lsp_latency_2.xml: 426M with 3,187,102 tags

xmllint ~  Desktop  time xmllint -noout ted_lsp_latency_2.xml

real 0m13.399s user 0m11.707s sys 0m1.668s  ~  Desktop  time xmllint -noout ted_lsp_latency_2.xml

real 0m13.252s user 0m11.628s sys 0m1.613s

  • libxml2 core but C++ client

PyRXP

 ~  Desktop  python benchmarks.py Interactive benchmark suite for Python XML tree-parsers. Using sample XML file 436848630 bytes long Parsers available: 1. pyRXP 2. pyRXP_nonvalidating 3. rparsexml 4. expat 5. minidom 6. msxml30 7. 4dom 8. cdomlette

Parser number (or x to exit) > 1 Shall we do memory tests? i.e. you look at Task Manager? y/n > n testing None counted 13591538 tags, 3187102 attributes pyRXP: init 0.0013, parse 21.3672, traverse 3.3879

 ~  Desktop  python benchmarks.py Interactive benchmark suite for Python XML tree-parsers. Using sample XML file 436848630 bytes long Parsers available: 1. pyRXP 2. pyRXP_nonvalidating 3. rparsexml 4. expat 5. minidom 6. msxml30 7. 4dom 8. cdomlette

Parser number (or x to exit) > 2 Shall we do memory tests? i.e. you look at Task Manager? y/n > n testing None counted 13591538 tags, 3187102 attributes pyRXP_nonvalidating: init 0.0015, parse 21.1104, traverse 3.4008

ElementTree - parse

import xml.etree.ElementTree
with open("ted_lsp_latency_2.xml", "rt") as f:
    tree = xml.etree.ElementTree.parse(f)
 ~  Desktop  time python t.py

real    0m21.379s
user    0m19.444s
sys     0m1.770s
 ~  Desktop  time python t.py

real    0m19.878s
user    0m18.408s
sys     0m1.437s

ElementTree - iterparse

import xml.etree.ElementTree

counter = 0
with open("ted_lsp_latency_2.xml", "rt") as f:
    for event, elem in xml.etree.ElementTree.iterparse(f, events=('start',)):
        if event == 'start':
            counter = counter + 1
            elem.clear()
print(counter)
 ~  Desktop  time python t.py

real    0m14.852s
user    0m14.635s
sys     0m0.199s
 ~  Desktop  time python t.py

real    0m14.948s
user    0m14.717s
sys     0m0.207s
 ~  Desktop  time python t.py
3187102

real    0m15.745s
user    0m15.481s
sys     0m0.231s
 ~  Desktop  time python t.py
3187102

real    0m15.461s
user    0m15.239s
sys     0m0.208s

lXML - iterparse

import lxml.etree

counter = 0
with open("ted_lsp_latency_2.xml", "rb") as f:
    for event, elem in lxml.etree.iterparse(f, events=('end',)):
        if event == 'end':
            counter = counter + 1
            elem.clear()
print(counter)
 ~  Desktop  time python t.py
3187102
q
real    0m14.226s
user    0m13.938s
sys     0m0.227s
 ~  Desktop  time python t.py
3187102

real    0m14.623s
user    0m14.354s
sys     0m0.232s

Expat

import xml.parsers.expat

parser = xml.parsers.expat.ParserCreate()

counter = 0

def end_element(name):
        global counter
        counter = counter + 1
parser.EndElementHandler = end_element

with open("ted_lsp_latency_2.xml", "rb") as f:
        parser.ParseFile(f)

print(counter)

 ~  Desktop  time python t.py 3187102

real 0m5.169s user 0m4.924s sys 0m0.203s  ~  Desktop  time python t.py 3187102

real 0m5.169s user 0m4.961s sys 0m0.197s  ~  Desktop  time python t.py 3187102

real 0m5.152s user 0m4.951s sys 0m0.190s

PugiXML

import pugi

doc = pugi.xml_document()
res = doc.load_file("ted_lsp_latency_2.xml")
print(res.status)

root = doc.root()
counter = 0
def count_things(node):
        global counter

        node_iterator = node.first_child()
        while not node_iterator.empty():
                counter = counter + 1
                count_things(node_iterator)
                node_iterator = node_iterator.next_sibling()
count_things(root)
print(counter)

 ~  Desktop  time python t.py 0 3193664

real 0m10.622s user 0m10.026s sys 0m0.575s  ~  Desktop  time python t.py 0 3193664

real 0m10.304s user 0m9.708s sys 0m0.579s  ~  Desktop  time python t.py 0 3193664

real 0m10.349s user 0m9.756s sys 0m0.577s

sxd_document

//extern crate roxmltree;
extern crate sxd_document;

use std::fs;
use std::io::Read;
use sxd_document::parser;

fn main() -> std::io::Result<()> {
    let mut file = fs::File::open("../ted_lsp_latency_2.xml").unwrap();
    let mut text = String::new();
    file.read_to_string(&mut text).unwrap();
    println!("Completed reading file");
    
    //let doc = roxmltree::Document::parse(&text).unwrap();
    let package = parser::parse(&text).unwrap();
    let doc = package.as_document();
    println!("Completed parsing file");

    //let counter = 0;
    //for node in doc.descendants() {
    //    if node.is_element() {
    //        counter = counter + 1;
    //    }
    //}

    //println!("Found {} elements", doc.descendants().count());
    println!("Found {} elements", doc.root().children().len());
    return std::result::Result::Ok(());
}
  master  ~  Desktop  foo  time ./target/release/foo
Completed reading file
Completed parsing file
Found 1 elements

real    0m31.388s
user    0m27.247s
sys     0m3.722s
  master  ~  Desktop  foo  time ./target/release/foo
Completed reading file
Completed parsing file
Found 1 elements

real    0m28.963s
user    0m25.613s
sys     0m3.243s

roxmltree

extern crate roxmltree;
//extern crate sxd_document;

use std::fs;
use std::io::Read;
//use sxd_document::parser;
use roxmltree::*;

fn main() -> std::io::Result<()> {
    let mut file = fs::File::open("../ted_lsp_latency_2.xml").unwrap();
    let mut text = String::new();
    file.read_to_string(&mut text).unwrap();
    println!("Completed reading file");
    
    //let doc = roxmltree::Document::parse(&text).unwrap();
    let doc = Document::parse(&text).unwrap();
    println!("Completed parsing file");

    //let counter = 0;
    //for node in doc.descendants() {
    //    if node.is_element() {
    //        counter = counter + 1;
    //    }
    //}

    //println!("Found {} elements", doc.descendants().count());
    //println!("Found {} elements", doc.root_element().children().len());
    return std::result::Result::Ok(());
}
  master  ~  Desktop  foo  time ./target/release/foo
Completed reading file
Completed parsing file

real    0m7.386s
user    0m5.932s
sys     0m1.434s
  master  ~  Desktop  foo  time ./target/release/foo
Completed reading file
Completed parsing file

real    0m7.223s
user    0m5.812s
sys     0m1.399s

quick-xml

//extern crate roxmltree;
//extern crate sxd_document;
extern crate quick_xml;

use std::fs;
use std::io::Read;
//use sxd_document::parser;
//use roxmltree::*;
use quick_xml::Reader;
use quick_xml::events::Event;

fn main() -> std::io::Result<()> {
    let mut file = fs::File::open("../ted_lsp_latency_2.xml").unwrap();
    let mut text = String::new();
    file.read_to_string(&mut text).unwrap();
    println!("Completed reading file");
    
    //let doc = roxmltree::Document::parse(&text).unwrap();
    let mut reader = Reader::from_str(&text);
    let mut count = 0;
    let mut buf = Vec::new();
    loop {
        match reader.read_event(&mut buf) {
            Ok(Event::Start(_))  => count += 1,
            Ok(Event::Decl(e)) => println!("{:?}", e.version()),
            Ok(Event::Eof) => break,
            Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
            _ => (),
        }

        buf.clear();
    }

    println!("Found {} elements", count);
    println!("Completed parsing file");

    return std::result::Result::Ok(());
}
  master  ~  Desktop  foo  time ./target/release/foo
Completed reading file
Ok([49, 46, 48])
Found 921485 elements
Completed parsing file

real    0m0.966s
user    0m0.684s
sys     0m0.278s
  master  ~  Desktop  foo  time ./target/release/foo
Completed reading file
Ok([49, 46, 48])
Found 921485 elements
Completed parsing file

real    0m0.989s
user    0m0.695s
sys     0m0.291s

—> lower element count?

GoLang builtin

package main

import (
    "encoding/xml"
    "io/ioutil"
)

type Graphml struct {
    XMLName xml.Name `xml:"http://graphml.graphdrawing.org/xmlns graphml"`
    Graph Graph      `xml:"http://graphml.graphdrawing.org/xmlns graph"`
    Keys []Key       `xml:"http://graphml.graphdrawing.org/xmlns key"`
}

type Graph struct {
    XMLName xml.Name `xml:"http://graphml.graphdrawing.org/xmlns graph"`
    Nodes []Node `xml:"http://graphml.graphdrawing.org/xmlns node"`
    Edges []Edge `xml:"http://graphml.graphdrawing.org/xmlns edge"`
}

type Key struct {
    XMLName xml.Name `xml:"http://graphml.graphdrawing.org/xmlns key"`
    AttributeName string `xml:"attr.name,attr"`
    AttributeType string `xml:"attr.type,attr"`
    For string `xml:"for,attr"`
    Id string `xml:"id,attr"`
}

type Node struct {
    XMLName xml.Name `xml:"http://graphml.graphdrawing.org/xmlns node"`
    Type string `xml:"type,attr"`
    Id string `xml:"id,attr"`
    IdTs int32 `xml:"id.ts,attr"`
    Data Tag `xml:"http://graphml.graphdrawing.org/xmlns data"`
}

type Edge struct {
    XMLName xml.Name `xml:"http://graphml.graphdrawing.org/xmlns edge"`
    Type string `xml:"type,attr"`
    Id string `xml:"id,attr"`
    Source string `xml:"source,attr"`
    SourceTs int32 `xml:"source.ts,attr"`
    Target string `xml:"target,attr"`
    TargetTs int32 `xml:"target.ts,attr"`
    Data Tag `xml:"http://graphml.graphdrawing.org/xmlns data"`
}

type Tag struct {
    XMLName xml.Name
    Content string `xml:",innerxml"`
}

func check(e error) {
    if e != nil {
        panic(e)
    }
}

func main() {
    dat, err := ioutil.ReadFile("ted_lsp_latency_2.xml")
    check(err)
    
    var f *Graphml
    err2 := xml.Unmarshal(dat, &f)
    check(err2)

    //fmt.Println("len:", f.XMLName)
    //fmt.Println("len:", len(f.Keys))
    //fmt.Println("len:", len(f.Graph.Edges))
    //fmt.Println("len:", len(f.Graph.Nodes))
}
 ~  Desktop  time ./test

real    0m18.873s
user    0m18.830s
sys     0m0.683s
 ~  Desktop  time ./test

real    0m19.261s
user    0m19.077s
sys     0m0.690s
 ~  Desktop  time ./test

real    0m18.907s
user    0m18.780s
sys     0m0.692s

Java

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;


public class SaxParser {
	public static class MyHandler extends DefaultHandler {
		int count = 0;

		@Override
		public void startElement(final String uri, final String localName, final String qName,
				final Attributes attributes) throws SAXException {
			this.count++;
		}

		public int getCount() {
			return this.count;
		}
	}


	public static void main(final String argv[]) {
		try {
			final SAXParserFactory factory = SAXParserFactory.newInstance();
			final SAXParser saxParser = factory.newSAXParser();
			final MyHandler handler = new MyHandler();

			saxParser.parse("ted_lsp_latency_2.xml", handler);
			System.out.println(handler.getCount());
		} catch (final Exception e) {
			e.printStackTrace();
		}
	}
}

 ~  Desktop  time java SaxParser 3187102

real 0m3.621s user 0m3.834s sys 0m0.225s  ~  Desktop  time java SaxParser 3187102

real 0m3.437s user 0m3.787s sys 0m0.207s  ~  Desktop  time java SaxParser 3187102

real 0m3.450s user 0m3.813s sys 0m0.209s

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
public class SaxParser {
public static class MyHandler extends DefaultHandler {
int count = 0;
@Override
public void startElement(final String uri, final String localName, final String qName,
final Attributes attributes) throws SAXException {
this.count++;
}
public int getCount() {
return this.count;
}
}
public static void main(final String argv[]) {
try {
final SAXParserFactory factory = SAXParserFactory.newInstance();
final SAXParser saxParser = factory.newSAXParser();
final MyHandler handler = new MyHandler();
saxParser.parse("ted_lsp_latency_2.xml", handler);
System.out.println(handler.getCount());
} catch (final Exception e) {
e.printStackTrace();
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment