Skip to content

Instantly share code, notes, and snippets.

@alessiosavi
Last active December 25, 2024 00:58
Show Gist options
  • Save alessiosavi/26d0a8da7a9fedaccadec61c23c5bd57 to your computer and use it in GitHub Desktop.
Save alessiosavi/26d0a8da7a9fedaccadec61c23c5bd57 to your computer and use it in GitHub Desktop.
WikiExtractor
package main
import (
	bz2 "compress/bzip2"
	"encoding/xml"
	"errors"
	"fmt"
	"io"
	"io/fs"
	"log"
	"os"
	"path/filepath"
	"regexp"
	"strings"

	"github.com/schollz/progressbar/v3"
	"golang.org/x/net/html"
)
// Page is the decoding target for a single <page> element of the dump.
// Only the children named in the struct tags are captured.
type Page struct {
	// Title is read from the <title> child.
	Title string `xml:"title"`
	// Ns is read from the <ns> child; main only keeps pages where it is 0.
	Ns int `xml:"ns"`
	// ID is read from the <id> child that sits directly under <page>.
	ID int64 `xml:"id"`
	// Text wraps the character data found at <revision><text>.
	Text struct {
		Value string `xml:",chardata"`
	} `xml:"revision>text"`
}
// WikiDump is a decoding target for an element whose <page> children should
// all be gathered into a slice.
type WikiDump struct {
	// Pages receives one entry per <page> child element.
	Pages []Page `xml:"page"`
}
// Input/output locations and the size threshold used by the extractor.
const (
	// dumpDir is walked recursively for bzip2-compressed XML dump files.
	dumpDir = "/tmp/wikidata"
	// outputDir receives one .txt file per extracted article.
	outputDir = "/tmp/wikidata_parsed"
	// minArticleLen is the text length (in bytes) an article must exceed
	// to be written out.
	minArticleLen = 256
)

// main walks dumpDir, streams every regular file found there through a bzip2
// reader and an XML decoder, and writes the raw text of each namespace-0 page
// longer than minArticleLen bytes to outputDir.
//
// Failing to walk the input directory, create the output directory, open a
// dump or decode a <page> element terminates the program.
func main() {
	var files []string
	walkErr := filepath.WalkDir(dumpDir, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			// d may be nil here, so bail out before touching it.
			return err
		}
		// Directories (including dumpDir itself) are not dumps; opening them
		// only produces read errors later on.
		if !d.IsDir() {
			files = append(files, path)
		}
		return nil
	})
	if walkErr != nil {
		log.Fatal(walkErr)
	}

	// Without the directory every single article write would fail.
	if err := os.MkdirAll(outputDir, 0755); err != nil {
		log.Fatal(err)
	}

	overall := progressbar.Default(int64(len(files)))
	for _, path := range files {
		if err := extractDump(path); err != nil {
			log.Fatal(err)
		}
		_ = overall.Add(1) // progress display is best-effort
	}
}

// extractDump processes a single dump file: it decodes each <page> element
// and writes qualifying articles to outputDir.
//
// The file is closed when the function returns, so handles do not pile up
// across dumps. Open and page-decoding failures are returned to the caller.
// A read or syntax error between pages is logged and ends processing of this
// file only. A failed article write is logged and the remaining pages are
// still processed.
func extractDump(path string) error {
	file, err := os.Open(path)
	if err != nil {
		return err
	}
	defer file.Close()

	decoder := xml.NewDecoder(bz2.NewReader(file))
	// -1: the number of pages in the dump is not known up front.
	bar := progressbar.Default(-1, filepath.Base(path))

	for {
		tok, err := decoder.Token()
		if errors.Is(err, io.EOF) {
			return nil
		}
		if err != nil {
			// Previously indistinguishable from a clean end of input; report
			// it so truncated or non-dump files do not go unnoticed.
			log.Printf("reading %s: %v; skipping the rest of this file", path, err)
			return nil
		}

		start, ok := tok.(xml.StartElement)
		if !ok || start.Name.Local != "page" {
			continue
		}
		_ = bar.Add(1) // progress display is best-effort

		var page Page
		if err := decoder.DecodeElement(&page, &start); err != nil {
			return fmt.Errorf("decoding page in %s: %w", path, err)
		}
		// Only articles (namespace 0) with enough text are kept.
		if page.Ns != 0 || len(page.Text.Value) <= minArticleLen {
			continue
		}

		out := filepath.Join(outputDir, pageFileName(page.Title))
		if err := os.WriteFile(out, []byte(page.Text.Value), 0644); err != nil {
			log.Printf("writing %s: %v", out, err)
		}
	}
}

// pageFileName maps an article title to the name of its output file.
// Slashes are replaced with underscores because a title such as "AC/DC"
// would otherwise point into a subdirectory that does not exist. Two titles
// that differ only in "/" versus "_" therefore share one file name.
func pageFileName(title string) string {
	return strings.ReplaceAll(title, "/", "_") + ".txt"
}
// Patterns used by cleanText to unwrap MediaWiki markup. Each has a single
// capture group, which cleanText substitutes for the whole match via "$1".
var (
	// reg1 targets double-square-bracket spans: "[[" ... "]]".
	reg1 = regexp.MustCompile(`\[\[(.*?\|?)+]]`)
	// reg2 targets double-curly-brace spans: "{{" ... "}}".
	reg2 = regexp.MustCompile(`\{\{(.*?\|?)+}}`)
)
func cleanText(text string) string {
// Remove MediaWiki markup (very basic example)
text = reg1.ReplaceAllString(text, "$1")
text = reg2.ReplaceAllString(text, "$1")
// Remove HTML tags (using net/html)
doc, err := html.Parse(strings.NewReader(text))
if err != nil {
return text
}
var buf strings.Builder
var f func(*html.Node)
f = func(n *html.Node) {
if n.Type == html.TextNode {
buf.WriteString(n.Data)
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
f(c)
}
}
f(doc)
text = strings.TrimSpace(buf.String())
text = strings.Replace(text, "[[", "", -1)
text = strings.Replace(text, "]]", "", -1)
text = strings.Replace(text, "{{", "", -1)
text = strings.Replace(text, "}}", "", -1)
return strings.TrimSpace(text)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment