-
-
Save varun-jabong/676ecf513ddb79b62bd7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import (
	"encoding/xml"
	"io"
	"log"
	"os"
	"runtime/debug"
	"strings"
)
// Article is one <doc> entry decoded from the abstract XML file.
// Each field is filled from the child element named in its xml tag.
type Article struct {
	Title    string `xml:"title"`    // text of the <title> child element
	Url      string `xml:"url"`      // text of the <url> child element
	Abstract string `xml:"abstract"` // text of the <abstract> child element
}
// CanonicalizeTitle lower-cases title and strips everything up to and
// including the first ":" (for example a "Wikipedia:" prefix).
//
// Only the first colon is treated as the separator, so a title that itself
// contains colons is kept intact. A title without any colon is returned
// lower-cased and otherwise unchanged instead of causing an index-out-of-range
// panic. Whitespace that follows the colon is preserved as-is.
func CanonicalizeTitle(title string) string {
	can := strings.ToLower(title)
	if i := strings.Index(can, ":"); i >= 0 {
		return can[i+1:]
	}
	return can
}
func readAbstract(filename string, article chan Article) { | |
defer close(article) | |
xmlFile, err := os.Open(filename) | |
if err != nil { | |
log.Printf("Error opening file:", err) | |
return | |
} | |
var inElement string | |
decoder := xml.NewDecoder(xmlFile) | |
for { | |
t, err := decoder.Token() | |
if err != nil { | |
log.Printf("Error while parsing -> %s", err) | |
debug.PrintStack() | |
continue | |
} | |
// Inspect the type of the token just read. | |
switch se := t.(type) { | |
case xml.StartElement: | |
// If we just read a StartElement token | |
inElement = se.Name.Local | |
// ...and its name is "page" | |
if inElement == "doc" { | |
var p Article | |
// decode a whole chunk of following XML into the | |
// variable p which is a Page (se above) | |
decoder.DecodeElement(&p, &se) | |
p.Title = CanonicalizeTitle(p.Title) | |
article <- p | |
} | |
default: | |
} | |
} | |
xmlFile.Close() | |
return | |
} | |
func main() { | |
d := make(chan Article, 400) | |
go readAbstract("enwiki-20140502-abstract.xml", d) | |
for r := range d { | |
log.Printf("%s", r.Title) | |
} | |
log.Printf("Reached here") | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment