Created
January 25, 2025 17:43
-
-
Save nirlanka/8a199223439b98e98562ae74391fc960 to your computer and use it in GitHub Desktop.
Extract html files from .epub files and move the content into .json files, including a list of files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"archive/zip" | |
"encoding/json" | |
"fmt" | |
"io" | |
"io/ioutil" | |
"os" | |
"slices" | |
"strings" | |
) | |
func main() { | |
args := os.Args[1:] | |
var epubName string | |
if len(args) > 0 { | |
epubName = args[0] | |
} else { | |
return | |
} | |
fmt.Println("EPUB:", epubName) | |
err := extractPages(epubName) | |
if err == nil { | |
convertToJson(epubName) | |
} | |
} | |
// Error return codes: | |
func extractPages(fname string) error { | |
// Create folder: | |
dirname := fmt.Sprintf("epub_to_reader_uncompressed/%s.d", fname) | |
// Note: | |
// Adding `.d` extension for Mac OS to not conclude the folder as a `.epub` file, | |
// instead conclude as a folder. | |
err := os.MkdirAll(dirname, 0777) | |
// if err != nil { | |
// log.Fatalf("Failed to MkdirAll/nError%s\n", err) | |
// return err | |
// } | |
checkErr(err) | |
// Open reader: | |
r, err := zip.OpenReader(fname) | |
// if err != nil { | |
// log.Fatalf("Failed opening zip reader %s\nError: %s\n", fname, err) | |
// return err | |
// } | |
checkErr(err) | |
defer r.Close() | |
fmt.Printf("Unzipping...\n") | |
// Iterate through files: | |
for _, f := range r.File { | |
// fmt.Printf("Unzipping %s...\n", f.Name) | |
rc, err := f.Open() | |
// if err != nil { | |
// log.Fatalf("Failed to open file [n: %d]\nError: %s\n", k, err) | |
// return err | |
// } | |
checkErr(err) | |
defer rc.Close() | |
newFilePath := fmt.Sprintf("%s/%s", dirname, f.Name) | |
// fmt.Printf(newFilePath) | |
// File under `text/` folder: | |
if (strings.Index(f.Name, "text/") == 0) && (!f.FileInfo().IsDir()) { | |
//// Case: directory: | |
//if f.FileInfo().IsDir() { | |
// err = os.MkdirAll(newFilePath, 0777) | |
// // if err != nil { | |
// // log.Fatalf("Failed to MkdirAll/nError%s\n", err) | |
// // return err | |
// // } | |
// checkErr(err) | |
// fmt.Println(dirname) | |
textDirname := fmt.Sprintf("%s/text/", dirname) | |
os.MkdirAll(textDirname, 0777) | |
// continue | |
//} | |
// Case: file: | |
uncompressedFile, err := os.Create(newFilePath) | |
// if err != nil { | |
// log.Fatalf("Failed to create file [n: %d]\nError: %s\n", k, err) | |
// return err | |
// } | |
checkErr(err) | |
_, err = io.Copy(uncompressedFile, rc) | |
// if err != nil { | |
// log.Fatalf("Failed to copy file content [n: %d]\nError: %s\n", k, err) | |
// return err | |
// } | |
checkErr(err) | |
fmt.Printf("- %s", f.Name) | |
} | |
} | |
fmt.Printf("Unzipping... [DONE]\n") | |
return nil // success | |
} | |
func convertToJson(fname string) error { | |
epubDirname := fmt.Sprintf("epub_to_reader_uncompressed/%s.d/text", fname) | |
jsonDirname := fmt.Sprintf("epub_to_reader_output/%s.d", fname) | |
// Create folder: | |
err := os.MkdirAll(jsonDirname, 0777) | |
// if err != nil { | |
// log.Fatalf("Failed to MkdirAll: %s/nError%s\n", jsonDirname, err) | |
// return err | |
// } | |
checkErr(err) | |
// Read files: | |
files, err := ioutil.ReadDir(epubDirname) | |
// if err != nil { | |
// log.Fatalf("Failed to read uncompressed files\nError: %s\n", err) | |
// return err | |
// } | |
checkErr(err) | |
var filenames []string | |
for _, f := range files { | |
fname := f.Name() | |
if strings.Contains(fname, ".html") { | |
filenames = append(filenames, fname) | |
} | |
} | |
slices.Sort(filenames) | |
// DEBUG | |
fmt.Println("Sorted html filenames:") | |
fmt.Println(filenames) | |
var chapters [][]string | |
c := make(chan convertStatus) | |
for i, f := range filenames { | |
fid := i + 1 | |
fromFname := fmt.Sprintf("%s/%s", epubDirname, f) | |
toFname := fmt.Sprintf("%s/%d.json", jsonDirname, fid) | |
fnamePart := strings.Replace(f, ".html", "", 1) | |
go convertSingle(fid, fnamePart, fromFname, toFname, c) | |
chapterId := fmt.Sprintf("%d", fid) | |
chapterTitle := strings.Replace(f, ".html", "", 1) | |
chapter := []string{chapterId, chapterTitle} | |
chapters = append(chapters, chapter) | |
} | |
result := make([]convertStatus, len(filenames)) | |
for i := range result { | |
result[i] = <-c | |
if result[i].status { | |
fmt.Println("[ok]", fname) | |
} else { | |
fmt.Println("[err]", fname) | |
} | |
} | |
indexFname := fmt.Sprintf("%s/index.json", jsonDirname) | |
jobj := jsonIndex{ | |
fname, | |
chapters, | |
} | |
jbytes, _ := json.Marshal(jobj) | |
err = os.WriteFile(indexFname, jbytes, 0644) | |
checkErr(err) | |
return nil | |
} | |
func convertSingle(fid int, fnamePart string, fromFname string, toFname string, c chan convertStatus) { | |
htmlContent, errRead := os.ReadFile(fromFname) | |
// checkErr(err) | |
jobj := jsonChapter{ | |
fnamePart, | |
string(htmlContent), | |
} | |
jbytes, _ := json.Marshal(jobj) | |
errWrite := os.WriteFile(toFname, jbytes, 0644) | |
// checkErr(err) | |
if errRead != nil || errWrite != nil { | |
c <- convertStatus{fnamePart, fid, false} | |
} else { | |
c <- convertStatus{fnamePart, fid, true} | |
} | |
} | |
// checkErr panics with err when it is non-nil and does nothing otherwise.
func checkErr(err error) {
	if err == nil {
		return
	}
	panic(err)
}
// convertStatus is the outcome of converting one chapter file, as sent on the
// result channel by convertSingle.
//
// The field order matters: values are built with positional literals.
type convertStatus struct {
	// fname is the chapter name handed to convertSingle.
	fname string
	// fid is the numeric id of the chapter.
	fid int
	// status is true when the conversion succeeded.
	status bool
}
// jsonChapter is the JSON document written for a single chapter.
//
// The field order matters: values are built with positional literals.
type jsonChapter struct {
	// Title is the chapter name.
	Title string
	// FullText is the content of the chapter's HTML file.
	FullText string
}
// jsonIndex is the JSON document written to `index.json`.
//
// The field order matters: values are built with positional literals.
type jsonIndex struct {
	// Title is the name the index was created for.
	Title string
	// Chapters holds one [id, title] pair per chapter, both as strings,
	// e.g. [["1", "chapter 1"], ["2", "chapter 2"]].
	Chapters [][]string
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment