Skip to content

Instantly share code, notes, and snippets.

@DataWraith
Last active October 5, 2015 08:13
Show Gist options
  • Save DataWraith/370ce4642c881f014a57 to your computer and use it in GitHub Desktop.
Save DataWraith/370ce4642c881f014a57 to your computer and use it in GitHub Desktop.
package main
// This program takes a .zim-file and dumps all contained articles below the
// current directory. I tested it with a Wikipedia snapshot; that did not
// contain deleted articles or LinkTargetEntry entries, so I'm unsure how to
// handle those; for now I'm ignoring them.
//
// Redirects are handled by simply writing out the page pointed to by the
// redirect. IPFS deduplication should take care of it, so I think this is the
// most economical solution, even better than writing out small HTML files with
// redirect-instructions.
//
// Gozim occasionally crashes, so you may need to restart the program many
// times until you get all articles. It will continue from where the crash
// stopped it.
import (
"fmt"
"github.com/akhenakh/gozim"
"os"
"sort"
"strings"
)
// INPUT is the path of the .zim archive to dump; main passes it to
// zim.NewReader.
const INPUT = "wikipedia_en_all_2015-05.zim"
// main dumps the articles of the ZIM archive named by INPUT into files below
// the current directory, one file per entry, at the path given by the entry's
// FullURL.
//
// The run is resumable: a binary search over the URL index looks for the first
// entry that has no file on disk yet and dumping continues from there. To keep
// that check trustworthy, each article is written to a scratch file and only
// renamed to its final path once it is complete, so an interrupted run never
// leaves a truncated file that a later run would mistake for a finished one.
//
// Deleted entries and link-target entries are skipped. A redirect is written
// out as a copy of the entry it points to. Any error aborts the program with a
// panic.
func main() {
	// NOTE(review): the second argument presumably enables mmap-based
	// reading; confirm against the gozim documentation.
	r, err := zim.NewReader(INPUT, true)
	if err != nil {
		panic(err)
	}
	defer r.Close()

	// Find the first article that is not yet dumped. Entries that are never
	// written (deleted / link-target) always look "not dumped", so the search
	// can land earlier than strictly necessary; that only costs some
	// re-writing, never a missed article.
	startIdx := sort.Search(int(r.ArticleCount), func(idx int) bool {
		article, err := r.ArticleAtURLIdx(uint32(idx))
		if err != nil {
			panic(err)
		}
		_, err = os.Stat(article.FullURL())
		return err != nil
	})

	for i := uint32(startIdx); i < r.ArticleCount; i++ {
		article, err := r.ArticleAtURLIdx(i)
		if err != nil {
			panic(err)
		}

		switch article.EntryType {
		case zim.DeletedEntry, zim.LinkTargetEntry:
			continue
		}

		url := article.FullURL()

		// Create the parent directory. A URL without any "/" has no parent
		// to create and is written straight into the current directory.
		if sep := strings.LastIndex(url, "/"); sep > 0 {
			if err := os.MkdirAll(url[:sep], os.ModePerm); err != nil {
				panic(err)
			}
		}

		// For a redirect, switch to the entry it points to; the data is
		// still stored under the redirect's own URL.
		if article.EntryType == zim.RedirectEntry {
			fmt.Printf("Found redirect: %s", url)
			idx, err := article.RedirectIndex()
			if err != nil {
				panic(err)
			}
			article, err = r.ArticleAtURLIdx(idx)
			if err != nil {
				panic(err)
			}
			fmt.Println("-> ", article.FullURL())
		}

		data, err := article.Data()
		if err != nil {
			panic(err)
		}

		if err := writeFileAtomic(url, data); err != nil {
			panic(err)
		}

		fmt.Println(url)
	}
}

// partialFileName is the name of the scratch file that writeFileAtomic creates
// in the destination directory before renaming it into place. It is short and
// fixed so that it cannot push a long article name over a file-name length
// limit.
const partialFileName = ".zimdump-partial"

// writeFileAtomic writes data to a scratch file in the same directory as path
// and then renames it to path, so that path only ever refers to a completely
// written file. The scratch file is removed again if writing fails.
func writeFileAtomic(path string, data []byte) error {
	tmp := partialFileName
	if sep := strings.LastIndex(path, "/"); sep >= 0 {
		tmp = path[:sep+1] + partialFileName
	}

	f, err := os.Create(tmp)
	if err != nil {
		return err
	}
	if _, err := f.Write(data); err != nil {
		// Best-effort cleanup; the write error is the one worth reporting.
		f.Close()
		os.Remove(tmp)
		return err
	}
	// Close can surface a delayed write error, so it must be checked before
	// the file is declared complete.
	if err := f.Close(); err != nil {
		os.Remove(tmp) // best-effort cleanup
		return err
	}
	return os.Rename(tmp, path)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment