Last active
October 5, 2015 08:13
-
-
Save DataWraith/370ce4642c881f014a57 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main
// This program takes a .zim-file and dumps all contained articles below the
// current directory. I tested it with a Wikipedia snapshot; that did not
// contain deleted articles or LinkTargetEntrys, so I'm unsure how to handle
// those; for now I'm ignoring them.
//
// Redirects are handled by simply writing out the page pointed to by the
// redirect. IPFS deduplication should take care of it, so I think this is the
// most economical solution, even better than writing out small HTML files with
// redirect-instructions.
//
// Gozim occasionally crashes, so you may need to restart the program many
// times until you get all articles. It will continue from where the crash
// stopped it.
import ( | |
"fmt" | |
"github.com/akhenakh/gozim" | |
"os" | |
"sort" | |
"strings" | |
) | |
// INPUT is the path of the .zim archive that main opens and dumps. It is a
// relative path, so it is resolved against the current working directory.
const (
	INPUT = "wikipedia_en_all_2015-05.zim"
)
func main() { | |
r, err := zim.NewReader(INPUT, true) | |
if err != nil { | |
panic(err) | |
} | |
defer r.Close() | |
// Find the first article that is not yet dumped | |
startIdx := sort.Search(int(r.ArticleCount), func(idx int) bool { | |
article, err := r.ArticleAtURLIdx(uint32(idx)) | |
if err != nil { | |
panic(err) | |
} | |
url := article.FullURL() | |
if _, err := os.Stat(url); err == nil { | |
return false | |
} | |
return true | |
}) | |
for i := uint32(startIdx); i < r.ArticleCount; i++ { | |
article, err := r.ArticleAtURLIdx(i) | |
if err != nil { | |
panic(err) | |
} | |
if article.EntryType == zim.DeletedEntry { | |
continue | |
} | |
if article.EntryType == zim.LinkTargetEntry { | |
continue | |
} | |
url := article.FullURL() | |
dir := url[0:strings.LastIndex(url, "/")] | |
err = os.MkdirAll(dir, os.ModePerm) | |
if err != nil { | |
panic(err) | |
} | |
if article.EntryType == zim.RedirectEntry { | |
fmt.Printf("Found redirect: %s", article.FullURL()) | |
idx, err := article.RedirectIndex() | |
if err != nil { | |
panic(err) | |
} | |
article, err = r.ArticleAtURLIdx(idx) | |
if err != nil { | |
panic(err) | |
} | |
fmt.Println("-> ", article.FullURL()) | |
} | |
data, err := article.Data() | |
if err != nil { | |
panic(err) | |
} | |
f, err := os.Create(url) | |
if err != nil { | |
panic(err) | |
} | |
_, err = f.Write(data) | |
if err != nil { | |
panic(err) | |
} | |
f.Close() | |
fmt.Println(url) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment