Last active
November 24, 2018 20:14
-
-
Save olastor/ad6d15dd721fd0c931997520e20d387a to your computer and use it in GitHub Desktop.
SuttaCentral: Merge the HTML files from `de/pli/sutta/sn` into one big file that can be converted to PDF or LaTeX using pandoc
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Place this script in /sc-data/html_text/de/pli/sutta/sn and install its dependencies first (`yarn add natural-sort glob cheerio`).
// Outputs book.html, a single file containing all merged texts, (hopefully) in the correct order.
// Minimal example for creating a PDF: pandoc -o book.pdf --pdf-engine=xelatex --toc book.html
const nsort = require('natural-sort');
const glob = require('glob');
const fs = require('fs');
const cheerio = require('cheerio');

// Name of the merged output file, written relative to the current working directory.
const OUTPUT_FILE = 'book.html';

/**
 * Escapes the characters that are significant in HTML element content.
 *
 * Needed because cheerio's `.text()` returns entity-decoded text, which must
 * be re-encoded before it is interpolated into markup again.
 *
 * @param {string} text - Plain text to embed in HTML.
 * @returns {string} The text with `&`, `<` and `>` replaced by entities.
 */
const escapeHtml = (text) =>
  text.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');

// Collect every HTML file below the working directory, merge the sutta
// articles in natural sort order and write the result to OUTPUT_FILE.
glob(
  '**/*.html',
  {
    // Skip the output of a previous run and any HTML shipped inside installed
    // dependencies; neither is part of the source texts.
    ignore: [OUTPUT_FILE, '**/node_modules/**'],
  },
  (er, files) => {
    if (er) {
      // Without this, `files` would be undefined and the failure would show
      // up as an unrelated TypeError below.
      throw er;
    }

    let bookAsHtml = '';
    let currentChapter = '';

    // Natural number sorting, so that e.g. "sn2" comes before "sn10".
    const orderedFiles = files.sort(nsort());

    for (const file of orderedFiles) {
      const $ = cheerio.load(fs.readFileSync(file, 'utf8'));

      // h1 -> h2 (the sutta title becomes a subsection of its chapter).
      // Each heading is replaced with its own content.
      $('.sutta h1').each((index, heading) => {
        const title = $(heading).html();
        $(heading).replaceWith(`<h2>${title}</h2>`);
      });

      // Start a new h1 section whenever the chapter name changes.
      const chapter = $('.sutta .hgroup > p').first().text();
      if (chapter && currentChapter !== chapter) {
        bookAsHtml += `<h1>${escapeHtml(chapter)}</h1>`;
        currentChapter = chapter;
      }

      // Files without a `.sutta article` element contribute nothing.
      const sutta = $('.sutta article').html();
      if (sutta) {
        bookAsHtml += sutta;
      }
    }

    fs.writeFileSync(OUTPUT_FILE, bookAsHtml);
  }
);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment