Skip to content

Instantly share code, notes, and snippets.

@varun-jabong
Forked from djinn/abstract_parse.go
Last active August 29, 2015 14:18
Show Gist options
  • Save varun-jabong/676ecf513ddb79b62bd7 to your computer and use it in GitHub Desktop.
Save varun-jabong/676ecf513ddb79b62bd7 to your computer and use it in GitHub Desktop.
package main
import (
	"encoding/xml"
	"io"
	"log"
	"os"
	"runtime/debug"
	"strings"
)
// Article holds the fields decoded from a single <doc> element of the
// abstract XML file. Each field is filled from the child element named in
// its xml struct tag.
type Article struct {
	// Title is the text of the <title> child element.
	Title string `xml:"title"`
	// Url is the text of the <url> child element.
	Url string `xml:"url"`
	// Abstract is the text of the <abstract> child element.
	Abstract string `xml:"abstract"`
}
// CanonicalizeTitle lower-cases title and removes its prefix up to and
// including the first ':' (for example "Wikipedia: Anarchism" becomes
// " anarchism"). Whitespace following the colon is kept as is.
//
// Only the first colon is treated as the prefix separator, so colons that
// belong to the title itself are preserved. A title with no colon has no
// prefix to strip and is returned lower-cased instead of causing an
// out-of-range panic.
func CanonicalizeTitle(title string) string {
	can := strings.ToLower(title)
	if i := strings.Index(can, ":"); i >= 0 {
		return can[i+1:]
	}
	return can
}
// readAbstract streams the XML file at filename and sends one Article on the
// article channel for every <doc> element it decodes. Each title is passed
// through CanonicalizeTitle before the Article is sent.
//
// The channel is closed before readAbstract returns, so the receiver can
// range over it. Failures are logged rather than returned:
//   - if the file cannot be opened, nothing is sent;
//   - a <doc> element that fails to decode is logged and skipped;
//   - a tokenizer error other than end of input is logged together with a
//     stack trace and ends the read.
func readAbstract(filename string, article chan Article) {
	defer close(article)

	xmlFile, err := os.Open(filename)
	if err != nil {
		log.Printf("Error opening file: %v", err)
		return
	}
	// Runs before the deferred close(article) above.
	defer xmlFile.Close()

	decoder := xml.NewDecoder(xmlFile)
	for {
		t, err := decoder.Token()
		if err == io.EOF {
			// Normal end of input: every element has been consumed.
			return
		}
		if err != nil {
			// Stop instead of retrying: calling Token again after a failure
			// just yields another error, which would spin this loop forever
			// and leave the channel open.
			log.Printf("Error while parsing -> %s", err)
			debug.PrintStack()
			return
		}

		// Only the start of a <doc> element is of interest; every other
		// token is skipped.
		se, ok := t.(xml.StartElement)
		if !ok || se.Name.Local != "doc" {
			continue
		}

		// Decode the whole <doc> element, children included, into p.
		var p Article
		if err := decoder.DecodeElement(&p, &se); err != nil {
			log.Printf("Error decoding <doc> element -> %s", err)
			continue
		}
		p.Title = CanonicalizeTitle(p.Title)
		article <- p
	}
}
// main runs readAbstract on the enwiki abstract file in a separate goroutine
// and logs the title of every Article received. Once the channel has been
// closed and drained it logs a final marker line.
func main() {
	articles := make(chan Article, 400)
	go readAbstract("enwiki-20140502-abstract.xml", articles)

	for {
		a, open := <-articles
		if !open {
			// Channel closed and drained: the reader has finished.
			break
		}
		log.Printf("%s", a.Title)
	}

	log.Printf("Reached here")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment