Created
June 2, 2014 12:56
-
-
Save djinn/14e7bd77ce05d49297cf to your computer and use it in GitHub Desktop.
XML parsing in golang
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import (
	"encoding/xml"
	"io"
	"log"
	"os"
	"runtime/debug"
	"strings"
)
// Article holds the fields extracted from one <doc> element of the abstract
// XML. Each field is populated from the child element named in its xml tag.
type Article struct {
	Title    string `xml:"title"`    // text of the <title> child element
	Url      string `xml:"url"`      // text of the <url> child element
	Abstract string `xml:"abstract"` // text of the <abstract> child element
}
// CanonicalizeTitle lower-cases title and removes everything up to and
// including the first ':' (for example a leading "prefix:" label). The text
// after that colon is returned as-is, so any whitespace following the colon
// and any later colons are preserved.
//
// A title that contains no colon is returned lower-cased in full instead of
// causing an index-out-of-range panic.
func CanonicalizeTitle(title string) string {
	can := strings.ToLower(title)
	// Further normalization (spaces to underscores, URL escaping) was left
	// disabled by the original author and is intentionally not applied here.
	if i := strings.Index(can, ":"); i >= 0 {
		return can[i+1:]
	}
	return can
}
func readAbstract(filename string, article chan Article) { | |
defer close(article) | |
xmlFile, err := os.Open(filename) | |
if err != nil { | |
log.Printf("Error opening file:", err) | |
return | |
} | |
var inElement string | |
decoder := xml.NewDecoder(xmlFile) | |
for { | |
t, err := decoder.Token() | |
if err != nil { | |
log.Printf("Error while parsing -> %s", err) | |
debug.PrintStack() | |
continue | |
} | |
// Inspect the type of the token just read. | |
switch se := t.(type) { | |
case xml.StartElement: | |
// If we just read a StartElement token | |
inElement = se.Name.Local | |
// ...and its name is "page" | |
if inElement == "doc" { | |
var p Article | |
// decode a whole chunk of following XML into the | |
// variable p which is a Page (se above) | |
decoder.DecodeElement(&p, &se) | |
p.Title = CanonicalizeTitle(p.Title) | |
article <- p | |
} | |
default: | |
} | |
} | |
xmlFile.Close() | |
return | |
} | |
func main() { | |
d := make(chan Article, 400) | |
go readAbstract("enwiki-20140502-abstract.xml", d) | |
for r := range d { | |
log.Printf("%s", r.Title) | |
} | |
log.Printf("Reached here") | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Parsing Wikimedia Abstract.xml in golang