Skip to content

Instantly share code, notes, and snippets.

@shal
Last active October 5, 2019 20:05
Show Gist options
  • Save shal/dfa76c0da9b8cf33554f67b3fa3e3538 to your computer and use it in GitHub Desktop.
Save shal/dfa76c0da9b8cf33554f67b3fa3e3538 to your computer and use it in GitHub Desktop.
Parsing of big XML files using stream of tokens
// Parser is a command line tool for parsing big XML files.
// Author: Ali Shanaakh <[email protected]>
// Usage: go run parse.go -path=./15-ufop/15.1-EX_XML_EDR_UO_03.10.2019.xml
package main
import (
	"encoding/xml"
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"strings"
	"time"

	"golang.org/x/text/encoding/charmap"
)
// Founder holds the text of a FOUNDER child element.
// It is the element type of Record.Founders.
type Founder struct {
// Founder receives the character data of the FOUNDER element.
Founder string `xml:"FOUNDER"`
}
// Record is the decoding target for a single RECORD element; a value of this
// type is passed to xml.Decoder.DecodeElement elsewhere in this file.
// Each field is filled from the child element named in its xml tag.
type Record struct {
EDRPOU string `xml:"EDRPOU"`
KVED string `xml:"KVED"`
Boss string `xml:"BOSS"`
Stan string `xml:"STAN"`
ShortName string `xml:"SHORT_NAME"`
Name string `xml:"NAME"`
Address string `xml:"ADDRESS"`
FoundingDocumentNum string `xml:"FOUNDING_DOCUMENT_NUM"`
// Founders gets one Founder value per FOUNDERS child element.
// NOTE(review): if a single FOUNDERS element wraps several FOUNDER
// children, the string field Founder.Founder can retain only one of them;
// a []string field tagged `xml:"FOUNDERS>FOUNDER"` would keep them all.
// Confirm against the actual data before relying on this field.
Founders []Founder `xml:"FOUNDERS"`
}
// path is the -path command line flag: the location of the XML file to parse.
// It defaults to the empty string.
var path = flag.String("path", "", "Path to XML file")
// windows1251 has the signature expected by xml.Decoder.CharsetReader. It
// returns a reader that decodes Windows-1251 encoded input to UTF-8.
//
// charset is the encoding label taken from the XML declaration. XML encoding
// names are case-insensitive, so the label is compared without regard to
// case ("windows-1251", "Windows-1251" and "WINDOWS-1251" are all accepted).
// Any other charset yields an error.
func windows1251(charset string, input io.Reader) (io.Reader, error) {
	if strings.EqualFold(charset, "windows-1251") {
		return charmap.Windows1251.NewDecoder().Reader(input), nil
	}
	return nil, fmt.Errorf("unknown charset: %s", charset)
}
// main parses the XML file named by the -path flag as a stream of tokens,
// logs how many times each element name was seen by the token loop, and
// finally logs the total execution time.
//
// It exits with status 2 and prints usage when -path is missing, and exits
// via log.Fatal when the file cannot be opened or is not well-formed.
func main() {
	flag.Parse()
	if *path == "" {
		flag.Usage()
		os.Exit(2)
	}

	start := time.Now()
	stats, err := countElements(*path)
	if err != nil {
		log.Fatal(err)
	}
	for k, v := range stats {
		log.Println(k, v)
	}
	log.Println("Time of execution", time.Since(start))
}

// countElements streams the XML file at name and returns, per element name,
// the number of start tags returned by the token loop. Every RECORD element
// is additionally decoded into a Record; because DecodeElement consumes the
// whole element, tags nested inside a RECORD are not counted individually.
//
// The file is closed before returning. The first read or decode error
// (other than a clean end of input) aborts the scan and is returned.
func countElements(name string) (map[string]int, error) {
	f, err := os.Open(name)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	stats := make(map[string]int)
	decoder := xml.NewDecoder(f)
	decoder.CharsetReader = windows1251
	for {
		// Read tokens from the XML document in a stream.
		t, err := decoder.Token()
		if err == io.EOF {
			// Clean end of input.
			return stats, nil
		}
		if err != nil {
			return nil, fmt.Errorf("reading %s near byte offset %d: %v", name, decoder.InputOffset(), err)
		}

		// Only start tags are of interest; skip every other token type.
		element, ok := t.(xml.StartElement)
		if !ok {
			continue
		}
		stats[element.Name.Local]++
		if element.Name.Local == "RECORD" {
			// A fresh Record per element so slice fields do not accumulate
			// values from previous records.
			var record Record
			if err := decoder.DecodeElement(&record, &element); err != nil {
				return nil, fmt.Errorf("decoding RECORD in %s near byte offset %d: %v", name, decoder.InputOffset(), err)
			}
		}
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment