Data Set
ted_lsp_latency_2.xml: 426M with 3,187,102 tags
xmllint ~ Desktop time xmllint -noout ted_lsp_latency_2.xml
real 0m13.399s user 0m11.707s sys 0m1.668s ~ Desktop time xmllint -noout ted_lsp_latency_2.xml
real 0m13.252s user 0m11.628s sys 0m1.613s
- libxml2 core but C++ client
PyRXP
~ Desktop python benchmarks.py Interactive benchmark suite for Python XML tree-parsers. Using sample XML file 436848630 bytes long Parsers available: 1. pyRXP 2. pyRXP_nonvalidating 3. rparsexml 4. expat 5. minidom 6. msxml30 7. 4dom 8. cdomlette
Parser number (or x to exit) > 1 Shall we do memory tests? i.e. you look at Task Manager? y/n > n testing None counted 13591538 tags, 3187102 attributes pyRXP: init 0.0013, parse 21.3672, traverse 3.3879
~ Desktop python benchmarks.py Interactive benchmark suite for Python XML tree-parsers. Using sample XML file 436848630 bytes long Parsers available: 1. pyRXP 2. pyRXP_nonvalidating 3. rparsexml 4. expat 5. minidom 6. msxml30 7. 4dom 8. cdomlette
Parser number (or x to exit) > 2 Shall we do memory tests? i.e. you look at Task Manager? y/n > n testing None counted 13591538 tags, 3187102 attributes pyRXP_nonvalidating: init 0.0015, parse 21.1104, traverse 3.4008
ElementTree - parse
import xml.etree.ElementTree
with open("ted_lsp_latency_2.xml", "rt") as f:
tree = xml.etree.ElementTree.parse(f)
~ Desktop time python t.py
real 0m21.379s
user 0m19.444s
sys 0m1.770s
~ Desktop time python t.py
real 0m19.878s
user 0m18.408s
sys 0m1.437s
ElementTree - iterparse
import xml.etree.ElementTree
counter = 0
with open("ted_lsp_latency_2.xml", "rt") as f:
for event, elem in xml.etree.ElementTree.iterparse(f, events=('start',)):
if event == 'start':
counter = counter + 1
elem.clear()
print(counter)
~ Desktop time python t.py
real 0m14.852s
user 0m14.635s
sys 0m0.199s
~ Desktop time python t.py
real 0m14.948s
user 0m14.717s
sys 0m0.207s
~ Desktop time python t.py
3187102
real 0m15.745s
user 0m15.481s
sys 0m0.231s
~ Desktop time python t.py
3187102
real 0m15.461s
user 0m15.239s
sys 0m0.208s
lXML - iterparse
import lxml.etree
counter = 0
with open("ted_lsp_latency_2.xml", "rb") as f:
for event, elem in lxml.etree.iterparse(f, events=('end',)):
if event == 'end':
counter = counter + 1
elem.clear()
print(counter)
~ Desktop time python t.py
3187102
q
real 0m14.226s
user 0m13.938s
sys 0m0.227s
~ Desktop time python t.py
3187102
real 0m14.623s
user 0m14.354s
sys 0m0.232s
Expat
import xml.parsers.expat
parser = xml.parsers.expat.ParserCreate()
counter = 0
def end_element(name):
global counter
counter = counter + 1
parser.EndElementHandler = end_element
with open("ted_lsp_latency_2.xml", "rb") as f:
parser.ParseFile(f)
print(counter)
~ Desktop time python t.py 3187102
real 0m5.169s user 0m4.924s sys 0m0.203s ~ Desktop time python t.py 3187102
real 0m5.169s user 0m4.961s sys 0m0.197s ~ Desktop time python t.py 3187102
real 0m5.152s user 0m4.951s sys 0m0.190s
PugiXML
import pugi
doc = pugi.xml_document()
res = doc.load_file("ted_lsp_latency_2.xml")
print(res.status)
root = doc.root()
counter = 0
def count_things(node):
global counter
node_iterator = node.first_child()
while not node_iterator.empty():
counter = counter + 1
count_things(node_iterator)
node_iterator = node_iterator.next_sibling()
count_things(root)
print(counter)
~ Desktop time python t.py 0 3193664
real 0m10.622s user 0m10.026s sys 0m0.575s ~ Desktop time python t.py 0 3193664
real 0m10.304s user 0m9.708s sys 0m0.579s ~ Desktop time python t.py 0 3193664
real 0m10.349s user 0m9.756s sys 0m0.577s
sxd_document
//extern crate roxmltree;
extern crate sxd_document;
use std::fs;
use std::io::Read;
use sxd_document::parser;
fn main() -> std::io::Result<()> {
let mut file = fs::File::open("../ted_lsp_latency_2.xml").unwrap();
let mut text = String::new();
file.read_to_string(&mut text).unwrap();
println!("Completed reading file");
//let doc = roxmltree::Document::parse(&text).unwrap();
let package = parser::parse(&text).unwrap();
let doc = package.as_document();
println!("Completed parsing file");
//let counter = 0;
//for node in doc.descendants() {
// if node.is_element() {
// counter = counter + 1;
// }
//}
//println!("Found {} elements", doc.descendants().count());
println!("Found {} elements", doc.root().children().len());
return std::result::Result::Ok(());
}
master ~ Desktop foo time ./target/release/foo
Completed reading file
Completed parsing file
Found 1 elements
real 0m31.388s
user 0m27.247s
sys 0m3.722s
master ~ Desktop foo time ./target/release/foo
Completed reading file
Completed parsing file
Found 1 elements
real 0m28.963s
user 0m25.613s
sys 0m3.243s
roxmltree
extern crate roxmltree;
//extern crate sxd_document;
use std::fs;
use std::io::Read;
//use sxd_document::parser;
use roxmltree::*;
fn main() -> std::io::Result<()> {
let mut file = fs::File::open("../ted_lsp_latency_2.xml").unwrap();
let mut text = String::new();
file.read_to_string(&mut text).unwrap();
println!("Completed reading file");
//let doc = roxmltree::Document::parse(&text).unwrap();
let doc = Document::parse(&text).unwrap();
println!("Completed parsing file");
//let counter = 0;
//for node in doc.descendants() {
// if node.is_element() {
// counter = counter + 1;
// }
//}
//println!("Found {} elements", doc.descendants().count());
//println!("Found {} elements", doc.root_element().children().len());
return std::result::Result::Ok(());
}
master ~ Desktop foo time ./target/release/foo
Completed reading file
Completed parsing file
real 0m7.386s
user 0m5.932s
sys 0m1.434s
master ~ Desktop foo time ./target/release/foo
Completed reading file
Completed parsing file
real 0m7.223s
user 0m5.812s
sys 0m1.399s
quick-xml
//extern crate roxmltree;
//extern crate sxd_document;
extern crate quick_xml;
use std::fs;
use std::io::Read;
//use sxd_document::parser;
//use roxmltree::*;
use quick_xml::Reader;
use quick_xml::events::Event;
fn main() -> std::io::Result<()> {
let mut file = fs::File::open("../ted_lsp_latency_2.xml").unwrap();
let mut text = String::new();
file.read_to_string(&mut text).unwrap();
println!("Completed reading file");
//let doc = roxmltree::Document::parse(&text).unwrap();
let mut reader = Reader::from_str(&text);
let mut count = 0;
let mut buf = Vec::new();
loop {
match reader.read_event(&mut buf) {
Ok(Event::Start(_)) => count += 1,
Ok(Event::Decl(e)) => println!("{:?}", e.version()),
Ok(Event::Eof) => break,
Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
_ => (),
}
buf.clear();
}
println!("Found {} elements", count);
println!("Completed parsing file");
return std::result::Result::Ok(());
}
master ~ Desktop foo time ./target/release/foo
Completed reading file
Ok([49, 46, 48])
Found 921485 elements
Completed parsing file
real 0m0.966s
user 0m0.684s
sys 0m0.278s
master ~ Desktop foo time ./target/release/foo
Completed reading file
Ok([49, 46, 48])
Found 921485 elements
Completed parsing file
real 0m0.989s
user 0m0.695s
sys 0m0.291s
—> lower element count?
GoLang builtin
package main
import (
"encoding/xml"
"io/ioutil"
)
type Graphml struct {
XMLName xml.Name `xml:"http://graphml.graphdrawing.org/xmlns graphml"`
Graph Graph `xml:"http://graphml.graphdrawing.org/xmlns graph"`
Keys []Key `xml:"http://graphml.graphdrawing.org/xmlns key"`
}
type Graph struct {
XMLName xml.Name `xml:"http://graphml.graphdrawing.org/xmlns graph"`
Nodes []Node `xml:"http://graphml.graphdrawing.org/xmlns node"`
Edges []Edge `xml:"http://graphml.graphdrawing.org/xmlns edge"`
}
type Key struct {
XMLName xml.Name `xml:"http://graphml.graphdrawing.org/xmlns key"`
AttributeName string `xml:"attr.name,attr"`
AttributeType string `xml:"attr.type,attr"`
For string `xml:"for,attr"`
Id string `xml:"id,attr"`
}
type Node struct {
XMLName xml.Name `xml:"http://graphml.graphdrawing.org/xmlns node"`
Type string `xml:"type,attr"`
Id string `xml:"id,attr"`
IdTs int32 `xml:"id.ts,attr"`
Data Tag `xml:"http://graphml.graphdrawing.org/xmlns data"`
}
type Edge struct {
XMLName xml.Name `xml:"http://graphml.graphdrawing.org/xmlns edge"`
Type string `xml:"type,attr"`
Id string `xml:"id,attr"`
Source string `xml:"source,attr"`
SourceTs int32 `xml:"source.ts,attr"`
Target string `xml:"target,attr"`
TargetTs int32 `xml:"target.ts,attr"`
Data Tag `xml:"http://graphml.graphdrawing.org/xmlns data"`
}
type Tag struct {
XMLName xml.Name
Content string `xml:",innerxml"`
}
func check(e error) {
if e != nil {
panic(e)
}
}
func main() {
dat, err := ioutil.ReadFile("ted_lsp_latency_2.xml")
check(err)
var f *Graphml
err2 := xml.Unmarshal(dat, &f)
check(err2)
//fmt.Println("len:", f.XMLName)
//fmt.Println("len:", len(f.Keys))
//fmt.Println("len:", len(f.Graph.Edges))
//fmt.Println("len:", len(f.Graph.Nodes))
}
~ Desktop time ./test
real 0m18.873s
user 0m18.830s
sys 0m0.683s
~ Desktop time ./test
real 0m19.261s
user 0m19.077s
sys 0m0.690s
~ Desktop time ./test
real 0m18.907s
user 0m18.780s
sys 0m0.692s
Java
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
public class SaxParser {
public static class MyHandler extends DefaultHandler {
int count = 0;
@Override
public void startElement(final String uri, final String localName, final String qName,
final Attributes attributes) throws SAXException {
this.count++;
}
public int getCount() {
return this.count;
}
}
public static void main(final String argv[]) {
try {
final SAXParserFactory factory = SAXParserFactory.newInstance();
final SAXParser saxParser = factory.newSAXParser();
final MyHandler handler = new MyHandler();
saxParser.parse("ted_lsp_latency_2.xml", handler);
System.out.println(handler.getCount());
} catch (final Exception e) {
e.printStackTrace();
}
}
}
~ Desktop time java SaxParser 3187102
real 0m3.621s user 0m3.834s sys 0m0.225s ~ Desktop time java SaxParser 3187102
real 0m3.437s user 0m3.787s sys 0m0.207s ~ Desktop time java SaxParser 3187102
real 0m3.450s user 0m3.813s sys 0m0.209s