miku · July 4, 2022 07:39
diff --git a/README.md b/README.md
diff --git a/example.go b/example.go
 package main

 import (
 	"encoding/json"
 	"encoding/xml"
 	"fmt"
 	"log"
 	"os"

 	"golang.org/x/net/html/charset"
 )

 // Records mirrors the root <Records> element of the XML document that main
 // decodes; encoding/xml fills it and encoding/json serializes it again.
 //
 // Fields tagged ",chardata" collect the character data found directly inside
 // the corresponding element, and fields tagged ",attr" are read from that
 // element's attributes. No json tags are declared, so the JSON output uses
 // the Go field names as keys.
 //
 // NOTE(review): the element names (header, metadata, dc, setSpec) look like
 // an OAI-PMH / Dublin Core export -- confirm against the actual data source.
 //
 // Records was generated 2022-06-28 16:01:40 by tir on trieste.
 type Records struct {
 	XMLName xml.Name `xml:"Records"`
 	Text    string   `xml:",chardata"`
 	Xsi     string   `xml:"xsi,attr"`
 	// Record holds one entry per <record> child element.
 	Record  []struct {
 		Text   string `xml:",chardata"`
 		Xmlns  string `xml:"xmlns,attr"`
 		// Header is decoded from the <header> child: a status attribute plus
 		// the identifier, datestamp and setSpec child elements.
 		Header struct {
 			Text       string `xml:",chardata"`
 			Status     string `xml:"status,attr"`
 			Identifier string `xml:"identifier"`
 			Datestamp  string `xml:"datestamp"`
 			SetSpec    string `xml:"setSpec"`
 		} `xml:"header"`
 		// Metadata is decoded from the <metadata> child, which wraps a single
 		// <dc> element.
 		Metadata struct {
 			Text string `xml:",chardata"`
 			// Dc carries the descriptive fields. Slice-typed fields accept
 			// repeated elements; struct-typed fields additionally keep the
 			// element's lang attribute next to its text.
 			Dc   struct {
 				Text           string `xml:",chardata"`
 				OaiDc          string `xml:"oai_dc,attr"`
 				Dc             string `xml:"dc,attr"`
 				Xsi            string `xml:"xsi,attr"`
 				SchemaLocation string `xml:"schemaLocation,attr"`
 				Title          struct {
 					Text string `xml:",chardata"`
 					Lang string `xml:"lang,attr"`
 				} `xml:"title"`
 				Creator   []string `xml:"creator"`
 				Publisher struct {
 					Text string `xml:",chardata"`
 					Lang string `xml:"lang,attr"`
 				} `xml:"publisher"`
 				Date string `xml:"date"`
 				Type []struct {
 					Text string `xml:",chardata"`
 					Lang string `xml:"lang,attr"`
 				} `xml:"type"`
 				Format     []string `xml:"format"`
 				Identifier []string `xml:"identifier"`
 				Source     []struct {
 					Text string `xml:",chardata"`
 					Lang string `xml:"lang,attr"`
 				} `xml:"source"`
 				Language string   `xml:"language"`
 				Relation []string `xml:"relation"`
 				Rights   []struct {
 					Text string `xml:",chardata"`
 					Lang string `xml:"lang,attr"`
 				} `xml:"rights"`
 				Description struct {
 					Text string `xml:",chardata"`
 					Lang string `xml:"lang,attr"`
 				} `xml:"description"`
 			} `xml:"dc"`
 		} `xml:"metadata"`
 		// About is the text content of the <about> child element.
 		About string `xml:"about"`
 	} `xml:"record"`
 }

 func main() {
 	dec := xml.NewDecoder(os.Stdin)
 	dec.CharsetReader = charset.NewReaderLabel
 	dec.Strict = false

 	var doc Records
 	if err := dec.Decode(&doc); err != nil {
 		log.Fatal(err)
 	}
 	b, err := json.Marshal(doc)
 	if err != nil {
 		log.Fatal(err)
 	}
 	fmt.Println(string(b))
 }
diff --git a/xstream.py b/xstream.py
 import sys
 import xml.etree.cElementTree as ET

 def xmlstream(source, tag, skip=0, aggregate=False):
    """
    Stream through an XML document and yield every element whose tag name
    (namespace ignored) equals `tag`. Each hit is serialized with
    ET.tostring, ready for xmltodict, a DOM parser or anything else.

    `source` is passed to ET.iterparse as is, so it can be a filename or a
    file object.

        for snippet in xmlstream("sample.xml", "sometag"):
            print(len(snippet))

    `skip` is a hack: that many matching "end" events are passed over before
    one is emitted. Use e.g. skip=1 when two XML tags with the same name are
    nested and the outer one is wanted.

    `aggregate` switches the yielded value from a single serialized element
    to a tuple: the passed-over matches first, the emitted (outermost)
    element last. With skip=0 the tuple has exactly one member.
    """
    def local_name(qualified):
        # ElementTree spells namespaced tags as "{uri}name".
        if '}' in qualified:
            return qualified.split('}')[1]
        return qualified

    # https://stackoverflow.com/a/13261805, http://effbot.org/elementtree/iterparse.htm
    events = iter(ET.iterparse(source, events=('start', 'end')))
    try:
        # The very first event is the start of the root element; keep a
        # handle on it so already-emitted children can be dropped later.
        _, root = next(events)
    except StopIteration:
        return

    collected = []
    remaining = skip

    for kind, node in events:
        if local_name(node.tag) != tag or kind == 'start':
            continue

        if remaining > 0:
            # Still passing over inner matches; remember them if asked to.
            remaining -= 1
            if aggregate:
                collected.append(ET.tostring(node))
            continue

        if not aggregate:
            yield ET.tostring(node)
        else:
            collected.append(ET.tostring(node))
            yield tuple(collected)

        # Release what has been handed out and start the next round.
        root.clear()
        del collected[:]
        remaining = skip

 # Demo: read XML from standard input and print a shortened preview of every
 # <record> element together with its running index.
 if __name__ == '__main__':
    # Give the parser raw bytes whenever stdin offers them. A text-mode stdin
    # would decode the input with the locale's encoding before the parser can
    # look at the encoding named in the XML declaration.
    stdin = getattr(sys.stdin, "buffer", sys.stdin)
    for i, blob in enumerate(xmlstream(stdin, "record")):
        # blob is the element serialized by ET.tostring (bytes); show only its
        # first 20 and last 30 bytes.
        snippet = "%s [...] %s" % (blob[:20].decode("utf-8"), blob[-30:].strip().decode("utf-8"))
        print("%d\t%s" % (i, snippet))
        # 0       <ns0:record xmlns:ns [...] /><ns0:about /></ns0:record>
        # 1       <ns0:record xmlns:ns [...] /><ns0:about /></ns0:record>
        # 2       <ns0:record xmlns:ns [...] /><ns0:about /></ns0:record>
        # 3       <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
        # 4       <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
        # 5       <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
        # 6       <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
        # 7       <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
        # 8       <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
        # 9       <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
	package main

	import (
	"encoding/json"
	"encoding/xml"
	"fmt"
	"log"
	"os"

	"golang.org/x/net/html/charset"
	)

	// Records mirrors the root <Records> element of the XML document that main
	// decodes; encoding/xml fills it and encoding/json serializes it again.
	//
	// Fields tagged ",chardata" collect the character data found directly inside
	// the corresponding element, and fields tagged ",attr" are read from that
	// element's attributes. No json tags are declared, so the JSON output uses
	// the Go field names as keys.
	//
	// NOTE(review): the element names (header, metadata, dc, setSpec) look like
	// an OAI-PMH / Dublin Core export -- confirm against the actual data source.
	//
	// Records was generated 2022-06-28 16:01:40 by tir on trieste.
	type Records struct {
	XMLName xml.Name `xml:"Records"`
	Text string `xml:",chardata"`
	Xsi string `xml:"xsi,attr"`
	// Record holds one entry per <record> child element.
	Record []struct {
	Text string `xml:",chardata"`
	Xmlns string `xml:"xmlns,attr"`
	// Header is decoded from the <header> child: a status attribute plus
	// the identifier, datestamp and setSpec child elements.
	Header struct {
	Text string `xml:",chardata"`
	Status string `xml:"status,attr"`
	Identifier string `xml:"identifier"`
	Datestamp string `xml:"datestamp"`
	SetSpec string `xml:"setSpec"`
	} `xml:"header"`
	// Metadata is decoded from the <metadata> child, which wraps a single
	// <dc> element.
	Metadata struct {
	Text string `xml:",chardata"`
	// Dc carries the descriptive fields. Slice-typed fields accept repeated
	// elements; struct-typed fields additionally keep the element's lang
	// attribute next to its text.
	Dc struct {
	Text string `xml:",chardata"`
	OaiDc string `xml:"oai_dc,attr"`
	Dc string `xml:"dc,attr"`
	Xsi string `xml:"xsi,attr"`
	SchemaLocation string `xml:"schemaLocation,attr"`
	Title struct {
	Text string `xml:",chardata"`
	Lang string `xml:"lang,attr"`
	} `xml:"title"`
	Creator []string `xml:"creator"`
	Publisher struct {
	Text string `xml:",chardata"`
	Lang string `xml:"lang,attr"`
	} `xml:"publisher"`
	Date string `xml:"date"`
	Type []struct {
	Text string `xml:",chardata"`
	Lang string `xml:"lang,attr"`
	} `xml:"type"`
	Format []string `xml:"format"`
	Identifier []string `xml:"identifier"`
	Source []struct {
	Text string `xml:",chardata"`
	Lang string `xml:"lang,attr"`
	} `xml:"source"`
	Language string `xml:"language"`
	Relation []string `xml:"relation"`
	Rights []struct {
	Text string `xml:",chardata"`
	Lang string `xml:"lang,attr"`
	} `xml:"rights"`
	Description struct {
	Text string `xml:",chardata"`
	Lang string `xml:"lang,attr"`
	} `xml:"description"`
	} `xml:"dc"`
	} `xml:"metadata"`
	// About is the text content of the <about> child element.
	About string `xml:"about"`
	} `xml:"record"`
	}

	// main decodes a single XML document from standard input into a Records
	// value and writes it to standard output as one line of JSON. Decoding and
	// marshalling errors are fatal.
	func main() {
		decoder := xml.NewDecoder(os.Stdin)
		// Charset-aware reader plus non-strict parsing, so the input does not
		// have to be UTF-8.
		decoder.CharsetReader = charset.NewReaderLabel
		decoder.Strict = false

		records := Records{}
		if decodeErr := decoder.Decode(&records); decodeErr != nil {
			log.Fatal(decodeErr)
		}

		payload, marshalErr := json.Marshal(records)
		if marshalErr != nil {
			log.Fatal(marshalErr)
		}
		fmt.Println(string(payload))
	}
	import sys
	import xml.etree.cElementTree as ET

	def xmlstream(source, tag, skip=0, aggregate=False):
	    """
	    Given an XML source (a filename or a file object, as accepted by
	    ET.iterparse) and a tag name (without namespace), stream through the
	    XML and emit each element denoted by tag, serialized with ET.tostring,
	    for processing, e.g. via xmltodict, a more convenient DOM parser or
	    something else.

	        for snippet in xmlstream("sample.xml", "sometag"):
	            print(len(snippet))

	    The `skip` parameter is a hack that allows skipping an "end" event,
	    i.e. waiting for another. Use e.g. skip=1 if there are two nested XML
	    tags with the same name and you want to get the outer one.

	    If `aggregate` is True, every yielded value is a tuple holding the
	    skipped matches followed by the emitted element (the outermost element
	    being the last). It mostly matters when skip > 0; with skip=0 the
	    tuple has a single member.
	    """
	    def strip_ns(tag):
	        # ElementTree writes namespaced tags as "{uri}name"; keep what
	        # follows the last "}" (the whole tag when there is none).
	        return tag.rpartition('}')[2]

	    # https://stackoverflow.com/a/13261805, http://effbot.org/elementtree/iterparse.htm
	    context = iter(ET.iterparse(source, events=(
	        'start',
	        'end',
	    )))
	    try:
	        # The first event is the start of the root element; hold on to it
	        # so that emitted children can be released later.
	        _, root = next(context)
	    except StopIteration:
	        return

	    blobs, s = [], skip

	    for event, elem in context:
	        if event == 'start' or strip_ns(elem.tag) != tag:
	            continue

	        if s > 0:
	            # An inner match that is skipped (and remembered on request).
	            if aggregate:
	                blobs.append(ET.tostring(elem))
	            s -= 1
	            continue

	        if aggregate:
	            blobs.append(ET.tostring(elem))
	            yield tuple(blobs)
	        else:
	            yield ET.tostring(elem)

	        # Drop the processed subtree and reset the per-match state.
	        root.clear()
	        blobs.clear()
	        s = skip

	# Demo: read XML from standard input and print a shortened preview of every
	# <record> element together with its running index.
	if __name__ == '__main__':
	    # Give the parser raw bytes whenever stdin offers them. A text-mode
	    # stdin would decode the input with the locale's encoding before the
	    # parser can look at the encoding named in the XML declaration.
	    stdin = getattr(sys.stdin, "buffer", sys.stdin)
	    for i, blob in enumerate(xmlstream(stdin, "record")):
	        # blob is the element serialized by ET.tostring (bytes); show only
	        # its first 20 and last 30 bytes.
	        snippet = "%s [...] %s" % (blob[:20].decode("utf-8"), blob[-30:].strip().decode("utf-8"))
	        print("%d\t%s" % (i, snippet))
	        # 0 <ns0:record xmlns:ns [...] /><ns0:about /></ns0:record>
	        # 1 <ns0:record xmlns:ns [...] /><ns0:about /></ns0:record>
	        # 2 <ns0:record xmlns:ns [...] /><ns0:about /></ns0:record>
	        # 3 <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
	        # 4 <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
	        # 5 <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
	        # 6 <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
	        # 7 <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
	        # 8 <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>
	        # 9 <ns0:record xmlns:dc [...] ta><ns0:about /></ns0:record>