Last active
October 5, 2019 20:05
-
-
Save shal/dfa76c0da9b8cf33554f67b3fa3e3538 to your computer and use it in GitHub Desktop.
Parsing of big XML files using a stream of tokens
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Parser is a command-line tool for parsing big XML files.
// Author: Ali Shanaakh <[email protected]>
// Usage: go run parse.go -path=./15-ufop/15.1-EX_XML_EDR_UO_03.10.2019.xml
package main | |
import ( | |
"encoding/xml" | |
"flag" | |
"fmt" | |
"io" | |
"log" | |
"os" | |
"time" | |
"golang.org/x/text/encoding/charmap" | |
) | |
// Founder is the element type of Record.Founders. Its single field is
// populated from a child element named FOUNDER.
type Founder struct {
	// Founder receives the character data of the FOUNDER child element.
	Founder string `xml:"FOUNDER"`
}
type Record struct { | |
EDRPOU string `xml:"EDRPOU"` | |
KVED string `xml:"KVED"` | |
Boss string `xml:"BOSS"` | |
Stan string `xml:"STAN"` | |
ShortName string `xml:"SHORT_NAME"` | |
Name string `xml:"NAME"` | |
Address string `xml:"ADDRESS"` | |
FoundingDocumentNum string `xml:"FOUNDING_DOCUMENT_NUM"` | |
Founders []Founder `xml:"FOUNDERS"` | |
} | |
// path is the -path command-line flag: the location of the XML file to
// parse. It defaults to the empty string and is only meaningful after
// flag.Parse has run.
var path = flag.String("path", "", "Path to XML file")
func windows1251(charset string, input io.Reader) (io.Reader, error) { | |
switch charset { | |
case "windows-1251": | |
return charmap.Windows1251.NewDecoder().Reader(input), nil | |
default: | |
return nil, fmt.Errorf("unknown charset: %s", charset) | |
} | |
} | |
func main() { | |
flag.Parse() | |
start := time.Now() | |
f, err := os.Open(*path) | |
if err != nil { | |
log.Fatal(err) | |
} | |
stats := make(map[string]int) | |
decoder := xml.NewDecoder(f) | |
decoder.CharsetReader = windows1251 | |
for { | |
// Read tokens from the XML document in a stream. | |
t, _ := decoder.Token() | |
if t == nil { | |
break | |
} | |
// Inspect the type of the token just read. | |
switch token := t.(type) { | |
case xml.StartElement: | |
stats[token.Name.Local]++ | |
if token.Name.Local == "RECORD" { | |
var record Record | |
decoder.DecodeElement(&record, &token) | |
} | |
} | |
} | |
for k, v := range stats { | |
log.Println(k, v) | |
} | |
log.Println("Time of execution", time.Since(start)) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment