Last active
December 25, 2024 00:58
-
-
Save alessiosavi/26d0a8da7a9fedaccadec61c23c5bd57 to your computer and use it in GitHub Desktop.
WikiExtractor
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
bz2 "compress/bzip2" | |
"encoding/xml" | |
"fmt" | |
"github.com/schollz/progressbar/v3" | |
"golang.org/x/net/html" | |
"io/fs" | |
"log" | |
"os" | |
"path/filepath" | |
"regexp" | |
"strings" | |
) | |
type Page struct { | |
Title string `xml:"title"` | |
Ns int `xml:"ns"` | |
ID int64 `xml:"id"` | |
Text struct { | |
Value string `xml:",chardata"` | |
} `xml:"revision>text"` | |
} | |
type WikiDump struct { | |
Pages []Page `xml:"page"` | |
} | |
func main() { | |
var files []string | |
filepath.WalkDir("/tmp/wikidata", func(path string, d fs.DirEntry, err error) error { | |
files = append(files, path) | |
return nil | |
}) | |
bar1 := progressbar.Default(int64(len(files))) | |
for _, filePath := range files { | |
file, err := os.Open(filePath) | |
if err != nil { | |
log.Fatal(err) | |
} | |
defer file.Close() | |
bz2Reader := bz2.NewReader(file) | |
decoder := xml.NewDecoder(bz2Reader) | |
bar := progressbar.Default(-1, filepath.Base(filePath)) | |
// Find the <page> elements and process them individually | |
for { | |
t, _ := decoder.Token() | |
if t == nil { | |
break | |
} | |
switch se := t.(type) { | |
case xml.StartElement: | |
if se.Name.Local == "page" { | |
bar.Add(1) | |
var page Page | |
err := decoder.DecodeElement(&page, &se) | |
if err != nil { | |
log.Printf("Error decoding page: %v", err) | |
panic(err) | |
} | |
if page.Ns == 0 { // Only process articles (Namespace 0) | |
if len(page.Text.Value) <= 256 { | |
continue | |
} | |
//text := cleanText(page.Text.Value) | |
text := page.Text.Value | |
// Process the extracted text as needed (e.g., save to file, index, etc.) | |
os.WriteFile(fmt.Sprintf("/tmp/wikidata_parsed/%s.txt", page.Title), []byte(text), 0644) | |
} | |
} | |
} | |
} | |
bar1.Add(1) | |
} | |
} | |
var reg1 = regexp.MustCompile(`\[\[(.*?\|?)+]]`) | |
var reg2 = regexp.MustCompile(`\{\{(.*?\|?)+}}`) | |
func cleanText(text string) string { | |
// Remove MediaWiki markup (very basic example) | |
text = reg1.ReplaceAllString(text, "$1") | |
text = reg2.ReplaceAllString(text, "$1") | |
// Remove HTML tags (using net/html) | |
doc, err := html.Parse(strings.NewReader(text)) | |
if err != nil { | |
return text | |
} | |
var buf strings.Builder | |
var f func(*html.Node) | |
f = func(n *html.Node) { | |
if n.Type == html.TextNode { | |
buf.WriteString(n.Data) | |
} | |
for c := n.FirstChild; c != nil; c = c.NextSibling { | |
f(c) | |
} | |
} | |
f(doc) | |
text = strings.TrimSpace(buf.String()) | |
text = strings.Replace(text, "[[", "", -1) | |
text = strings.Replace(text, "]]", "", -1) | |
text = strings.Replace(text, "{{", "", -1) | |
text = strings.Replace(text, "}}", "", -1) | |
return strings.TrimSpace(text) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment