Skip to content

Instantly share code, notes, and snippets.

@miku
Last active July 4, 2022 07:39
Show Gist options
  • Save miku/bb681be559d8103f053c93d4e51da2e8 to your computer and use it in GitHub Desktop.
Save miku/bb681be559d8103f053c93d4e51da2e8 to your computer and use it in GitHub Desktop.
Streaming through XML with a Python helper function

XML and large files

Python and stdlib

Using only the Python standard library (xstream.py): stream through a large file by looking at, e.g., one "record" at a time.

$ metha-sync https://bop.unibe.ch/baf/oai && \
    metha-cat https://bop.unibe.ch/baf/oai | python xstream.py

0       <ns0:record xmlns:ns [...] /><ns0:about /></ns0:record>
1       <ns0:record xmlns:ns [...] /><ns0:about /></ns0:record>
2       <ns0:record xmlns:ns [...] /><ns0:about /></ns0:record>
3       <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
4       <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
5       <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
6       <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
7       <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
8       <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
9       <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
...

Extracting values from tags

Hacky, but ... hacky - with xmlcutty - mostly only for generating some lists out of the text within tags.

$ metha-sync https://bop.unibe.ch/baf/oai && \
    metha-cat https://bop.unibe.ch/baf/oai | \
    xmlcutty -path /Records/record/metadata/dc/identifier -rename '\n' | \
    grep ^http

https://bop.unibe.ch/baf/article/view/3666
https://bop.unibe.ch/baf/article/view/3667
https://bop.unibe.ch/baf/article/view/4195
https://bop.unibe.ch/baf/article/view/4211
https://bop.unibe.ch/baf/article/view/4454
https://bop.unibe.ch/baf/article/view/7197
https://bop.unibe.ch/baf/article/view/7275
https://bop.unibe.ch/baf/article/view/3355
https://bop.unibe.ch/baf/article/view/3498
...

Pure Go

Generate a Go struct that can handle a certain XML document structure. Via zek, -p generates a small example program that converts XML to JSON in a streaming fashion.

$ metha-sync https://bop.unibe.ch/baf/oai && \
    metha-cat https://bop.unibe.ch/baf/oai | \
    zek -p -j > example.go

$ metha-sync https://bop.unibe.ch/baf/oai && \
    metha-cat https://bop.unibe.ch/baf/oai | \
    GO111MODULE=off go run example.go  | jq .

{
  "XMLName": {
    "Space": "",
    "Local": "Records"
  },
  "Xsi": "http://www.w3.org/2001/XMLSchema-instance",
  "Record": [
    {
      "Text": "",
      "Xmlns": "http://www.openarchives.org/OAI/2.0/",
      "Header": {
        "Text": "",
        "Status": "deleted",
        "Identifier": "oai:ojs.bop.unibe.ch:article/2781",
        "Datestamp": "2016-06-02T13:46:06Z",
        "SetSpec": "baf:ART"
      },
      "Metadata": {
        "Text": "",

package main
import (
"encoding/json"
"encoding/xml"
"fmt"
"log"
"os"
"golang.org/x/net/html/charset"
)
// Records was generated 2022-06-28 16:01:40 by tir on trieste.
//
// Records mirrors an XML document whose root element is named "Records"
// (see the XMLName tag). The struct tags drive encoding/xml decoding: fields
// tagged ",chardata" receive an element's character data, fields tagged
// ",attr" are read from attributes, and all other tags name child elements.
// Slice-typed fields collect child elements that may occur more than once.
// The type carries no json tags, so encoding/json emits the Go field names.
type Records struct {
XMLName xml.Name `xml:"Records"`
Text string `xml:",chardata"`
Xsi string `xml:"xsi,attr"`
// Record holds one entry per "record" child element.
Record []struct {
Text string `xml:",chardata"`
Xmlns string `xml:"xmlns,attr"`
// Header maps the "header" child: a status attribute plus the
// identifier, datestamp and setSpec child elements.
Header struct {
Text string `xml:",chardata"`
Status string `xml:"status,attr"`
Identifier string `xml:"identifier"`
Datestamp string `xml:"datestamp"`
SetSpec string `xml:"setSpec"`
} `xml:"header"`
// Metadata maps the "metadata" child, which wraps a single "dc" element.
Metadata struct {
Text string `xml:",chardata"`
// Dc maps the "dc" element and its descriptive child elements.
Dc struct {
Text string `xml:",chardata"`
OaiDc string `xml:"oai_dc,attr"`
Dc string `xml:"dc,attr"`
Xsi string `xml:"xsi,attr"`
SchemaLocation string `xml:"schemaLocation,attr"`
// Several children below pair their text with a "lang" attribute.
Title struct {
Text string `xml:",chardata"`
Lang string `xml:"lang,attr"`
} `xml:"title"`
Creator []string `xml:"creator"`
Publisher struct {
Text string `xml:",chardata"`
Lang string `xml:"lang,attr"`
} `xml:"publisher"`
Date string `xml:"date"`
Type []struct {
Text string `xml:",chardata"`
Lang string `xml:"lang,attr"`
} `xml:"type"`
Format []string `xml:"format"`
Identifier []string `xml:"identifier"`
Source []struct {
Text string `xml:",chardata"`
Lang string `xml:"lang,attr"`
} `xml:"source"`
Language string `xml:"language"`
Relation []string `xml:"relation"`
Rights []struct {
Text string `xml:",chardata"`
Lang string `xml:"lang,attr"`
} `xml:"rights"`
Description struct {
Text string `xml:",chardata"`
Lang string `xml:"lang,attr"`
} `xml:"description"`
} `xml:"dc"`
} `xml:"metadata"`
About string `xml:"about"`
} `xml:"record"`
}
func main() {
dec := xml.NewDecoder(os.Stdin)
dec.CharsetReader = charset.NewReaderLabel
dec.Strict = false
var doc Records
if err := dec.Decode(&doc); err != nil {
log.Fatal(err)
}
b, err := json.Marshal(doc)
if err != nil {
log.Fatal(err)
}
fmt.Println(string(b))
}
import sys

try:
    # The C accelerator alias was removed in Python 3.9; keep it for older
    # interpreters and fall back to the regular module elsewhere.
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
def xmlstream(source, tag, skip=0, aggregate=False):
    """
    Stream through an XML document and yield each element named `tag`.

    `source` is handed to `ET.iterparse`, so it may be a path to an XML file
    or a file object. `tag` is the element name without namespace; any
    "{namespace}" prefix on parsed element names is ignored when matching.
    Each match is emitted in serialized form (the result of `ET.tostring`)
    for further processing, e.g. via xmltodict, a more convenient DOM parser
    or something else.

        for snippet in xmlstream("sample.xml", "sometag"):
            print(len(snippet))

    The `skip` parameter is a hack that allows to skip an "end" event, i.e.
    to wait for another. Use e.g. skip=1 if there are two nested XML tags
    with the same name and you want to get the outer one.

    The `aggregate` parameter is relevant only if skip > 0. If `aggregate`
    is True, all matched elements (the skipped ones included) are collected
    and yielded as a tuple, with the outermost element being the last.

    The serialized snippets contain the element only: text that follows the
    closing tag (the element's "tail") is never part of the output.
    """

    def strip_ns(name):
        # ElementTree spells qualified names as "{namespace-uri}local".
        return name.rsplit('}', 1)[-1]

    def serialize(elem):
        # ET.tostring() also writes elem.tail, i.e. whatever text follows the
        # closing tag. That text is not part of the element, and whether it
        # has been parsed yet at an "end" event depends on parser buffering.
        # Hide it while serializing and restore it afterwards so that an
        # enclosing element still serializes with its full content.
        tail, elem.tail = elem.tail, None
        try:
            return ET.tostring(elem)
        finally:
            elem.tail = tail

    # https://stackoverflow.com/a/13261805, http://effbot.org/elementtree/iterparse.htm
    context = iter(ET.iterparse(source, events=('start', 'end')))
    try:
        # The first event is the start of the root element; keep a handle on
        # it so already processed children can be dropped later.
        _, root = next(context)
    except StopIteration:
        return

    blobs, remaining = [], skip
    for event, elem in context:
        # Only completed elements with the requested name are of interest.
        if event == 'start' or strip_ns(elem.tag) != tag:
            continue

        if remaining > 0:
            # Still waiting for an outer element of the same name.
            if aggregate:
                blobs.append(serialize(elem))
            remaining -= 1
            continue

        if aggregate:
            blobs.append(serialize(elem))
            yield tuple(blobs)
        else:
            yield serialize(elem)

        # Free everything parsed so far to keep memory usage flat, and reset
        # the per-match state.
        root.clear()
        blobs.clear()
        remaining = skip
if __name__ == '__main__':
    # Hand the parser raw bytes when available, so the encoding declared in
    # the XML document is honoured instead of the locale encoding that the
    # text-mode stdin wrapper would apply first.
    stdin = getattr(sys.stdin, "buffer", sys.stdin)
    for i, blob in enumerate(xmlstream(stdin, "record")):
        # blob is the serialized XML of one record; print an abbreviated
        # preview made of its first 20 and last 30 bytes.
        head = blob[:20].decode("utf-8")
        tail = blob[-30:].strip().decode("utf-8")
        snippet = "%s [...] %s" % (head, tail)
        print("%d\t%s" % (i, snippet))

# Example output:
#
# 0 <ns0:record xmlns:ns [...] /><ns0:about /></ns0:record>
# 1 <ns0:record xmlns:ns [...] /><ns0:about /></ns0:record>
# 2 <ns0:record xmlns:ns [...] /><ns0:about /></ns0:record>
# 3 <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
# 4 <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
# 5 <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
# 6 <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
# 7 <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
# 8 <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
# 9 <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment