Skip to content

Instantly share code, notes, and snippets.

@nirlanka
Created January 25, 2025 17:43
Show Gist options
  • Save nirlanka/8a199223439b98e98562ae74391fc960 to your computer and use it in GitHub Desktop.
Save nirlanka/8a199223439b98e98562ae74391fc960 to your computer and use it in GitHub Desktop.
Extract the HTML files from an .epub file and copy their content into .json files (one per chapter), plus an index.json that lists the chapters
package main
import (
"archive/zip"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"os"
"slices"
"strings"
)
// main is the command-line entry point.
//
// It expects the path of an EPUB file as the first argument, extracts the
// archive's pages with extractPages and, when that succeeds, converts them
// with convertToJson.
//
// Exit status: 0 on success, 1 when extraction or conversion returns an
// error, 2 when no EPUB path was given.
func main() {
	args := os.Args[1:]
	if len(args) == 0 {
		// Tell the user what is missing instead of exiting silently with
		// a success status.
		fmt.Fprintf(os.Stderr, "usage: %s <file.epub>\n", os.Args[0])
		os.Exit(2)
	}
	epubName := args[0]
	fmt.Println("EPUB:", epubName)

	if err := extractPages(epubName); err != nil {
		fmt.Fprintln(os.Stderr, "error:", err)
		os.Exit(1)
	}
	// The conversion result used to be discarded; surface it as well.
	if err := convertToJson(epubName); err != nil {
		fmt.Fprintln(os.Stderr, "error:", err)
		os.Exit(1)
	}
}
// extractPages unzips the pages of the EPUB archive fname.
//
// Only regular files whose archive name starts with "text/" are extracted.
// They are written below "epub_to_reader_uncompressed/<fname>.d/" under their
// archive-relative name. Entries whose name could point outside that folder
// are skipped with a message on stderr.
//
// Progress is printed to stdout. On failure a "[FAILED]" line with the cause
// is printed to stderr and the error is returned; on success the result is nil.
func extractPages(fname string) error {
	if err := unzipTextEntries(fname); err != nil {
		fmt.Fprintf(os.Stderr, "Unzipping... [FAILED]: %v\n", err)
		return err
	}
	fmt.Printf("Unzipping... [DONE]\n")
	return nil // success
}

// unzipTextEntries does the actual work of extractPages and returns the first
// error it runs into.
func unzipTextEntries(fname string) error {
	// Note:
	// Adding `.d` extension for Mac OS to not conclude the folder as a `.epub` file,
	// instead conclude as a folder.
	dirname := fmt.Sprintf("epub_to_reader_uncompressed/%s.d", fname)
	if err := os.MkdirAll(dirname, 0777); err != nil {
		return fmt.Errorf("creating folder %q: %w", dirname, err)
	}

	r, err := zip.OpenReader(fname)
	if err != nil {
		return fmt.Errorf("opening epub %q: %w", fname, err)
	}
	defer r.Close()

	fmt.Printf("Unzipping...\n")
	for _, f := range r.File {
		// Only files under `text/` are of interest; nothing else is opened.
		if !strings.HasPrefix(f.Name, "text/") || f.FileInfo().IsDir() {
			continue
		}
		if !isSafeEntryName(f.Name) {
			fmt.Fprintf(os.Stderr, "- skipped unsafe entry %q\n", f.Name)
			continue
		}
		if err := extractEntry(f, dirname+"/"+f.Name); err != nil {
			return fmt.Errorf("extracting %q: %w", f.Name, err)
		}
		fmt.Printf("- %s\n", f.Name)
	}
	return nil
}

// isSafeEntryName reports whether a zip entry name is free of ".." path
// segments and backslashes, so joining it to the output folder cannot lead
// outside that folder.
func isSafeEntryName(name string) bool {
	if strings.ContainsRune(name, '\\') {
		return false
	}
	for _, segment := range strings.Split(name, "/") {
		if segment == ".." {
			return false
		}
	}
	return true
}

// extractEntry copies the content of one zip entry into destPath, creating
// the parent folder first. It lives in its own function so that both files
// are closed as soon as the entry is done rather than when the whole archive
// has been processed.
func extractEntry(f *zip.File, destPath string) error {
	rc, err := f.Open()
	if err != nil {
		return err
	}
	defer rc.Close()

	if i := strings.LastIndex(destPath, "/"); i > 0 {
		if err := os.MkdirAll(destPath[:i], 0777); err != nil {
			return err
		}
	}

	out, err := os.Create(destPath)
	if err != nil {
		return err
	}
	if _, err := io.Copy(out, rc); err != nil {
		out.Close()
		return err
	}
	// A failing Close can mean the data never reached the disk.
	return out.Close()
}
// convertToJson turns the HTML pages previously extracted from the EPUB
// fname into JSON files.
//
// It lists "epub_to_reader_uncompressed/<fname>.d/text", takes every regular
// file ending in ".html" in sorted name order, and converts file number n
// (1-based) into "epub_to_reader_output/<fname>.d/<n>.json" via convertSingle,
// one goroutine per file. Afterwards it writes "index.json" in the same
// folder, holding fname as title and one [id, title] pair per chapter.
//
// Failing to create the output folder, list the pages or write the index
// goes through checkErr. If individual chapters fail, the index is still
// written and an error naming the number of failures is returned; otherwise
// the result is nil.
func convertToJson(fname string) error {
	epubDirname := fmt.Sprintf("epub_to_reader_uncompressed/%s.d/text", fname)
	jsonDirname := fmt.Sprintf("epub_to_reader_output/%s.d", fname)

	// Create folder:
	err := os.MkdirAll(jsonDirname, 0777)
	checkErr(err)

	// Read files:
	files, err := ioutil.ReadDir(epubDirname)
	checkErr(err)

	var filenames []string
	for _, f := range files {
		// Match on the extension only, and ignore folders that happen to
		// carry an ".html" name.
		if !f.IsDir() && strings.HasSuffix(f.Name(), ".html") {
			filenames = append(filenames, f.Name())
		}
	}
	slices.Sort(filenames)

	// DEBUG
	fmt.Println("Sorted html filenames:")
	fmt.Println(filenames)

	// Non-nil even when empty, so the index encodes "Chapters" as [] rather
	// than null.
	chapters := make([][]string, 0, len(filenames))
	c := make(chan convertStatus)
	for i, f := range filenames {
		fid := i + 1
		fromFname := fmt.Sprintf("%s/%s", epubDirname, f)
		toFname := fmt.Sprintf("%s/%d.json", jsonDirname, fid)
		title := strings.TrimSuffix(f, ".html")

		go convertSingle(fid, title, fromFname, toFname, c)

		chapters = append(chapters, []string{fmt.Sprintf("%d", fid), title})
	}

	// Every goroutine sends exactly one status; collect them all so none of
	// them stays blocked on the unbuffered channel.
	failed := 0
	for range filenames {
		res := <-c
		// Report the chapter that was converted, not the EPUB name.
		if res.status {
			fmt.Println("[ok]", res.fname)
		} else {
			fmt.Println("[err]", res.fname)
			failed++
		}
	}

	indexFname := fmt.Sprintf("%s/index.json", jsonDirname)
	jbytes, err := json.Marshal(jsonIndex{
		Title:    fname,
		Chapters: chapters,
	})
	checkErr(err)
	err = os.WriteFile(indexFname, jbytes, 0644)
	checkErr(err)

	if failed > 0 {
		return fmt.Errorf("%d of %d chapters could not be converted", failed, len(filenames))
	}
	return nil
}
// convertSingle converts one extracted HTML file into a chapter JSON file and
// reports the outcome on c.
//
// fid is the chapter number and fnamePart the chapter title; both are echoed
// back in the convertStatus. fromFname is the HTML file to read and toFname
// the JSON file to write. Exactly one convertStatus is sent on c per call,
// with status set to true only when the JSON file was written. The reason for
// a failure is printed to stderr.
func convertSingle(fid int, fnamePart string, fromFname string, toFname string, c chan convertStatus) {
	err := writeChapterJSON(fnamePart, fromFname, toFname)
	if err != nil {
		fmt.Fprintf(os.Stderr, "converting %q: %v\n", fromFname, err)
	}
	c <- convertStatus{fname: fnamePart, fid: fid, status: err == nil}
}

// writeChapterJSON reads the HTML file fromFname and stores it, together with
// title, as a jsonChapter in toFname. Nothing is written when the source
// cannot be read or encoded, so a failed read no longer leaves a chapter file
// with empty text behind.
func writeChapterJSON(title string, fromFname string, toFname string) error {
	htmlContent, err := os.ReadFile(fromFname)
	if err != nil {
		return err
	}
	jbytes, err := json.Marshal(jsonChapter{
		Title:    title,
		FullText: string(htmlContent),
	})
	if err != nil {
		return err
	}
	return os.WriteFile(toFname, jbytes, 0644)
}
// checkErr panics with err when err is non-nil and does nothing otherwise.
func checkErr(err error) {
	if err == nil {
		return
	}
	panic(err)
}
// convertStatus is the per-file result that convertSingle sends on its
// channel. It is built with positional literals, so the field order matters.
type convertStatus struct {
// fname is the chapter name handed to convertSingle (the HTML file name
// without its ".html" part).
fname string
// fid is the 1-based chapter number assigned by convertToJson.
fid int
// status is true when the conversion succeeded and false when reading the
// source or writing the JSON file failed.
status bool
}
// jsonChapter is the content of one "<n>.json" chapter file. The fields carry
// no json tags, so they are encoded under the keys "Title" and "FullText".
type jsonChapter struct {
// Title is the chapter name (the HTML file name without its ".html" part).
Title string
// FullText is the complete content of the source HTML file.
FullText string
}
// jsonIndex is the content of "index.json". The fields carry no json tags, so
// they are encoded under the keys "Title" and "Chapters".
type jsonIndex struct {
// Title is the EPUB file name that was passed to convertToJson.
Title string
Chapters [][]string // one [id, title] pair per chapter, both strings: [["1", "chapter 1"], ["2", "chapter 2"]]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment