Accompanying article showing how we reduced the storage requirements of a huge clinical database by removing legacy HTML inefficiencies - #TagSoup #Slop : https://bradley.software/article/optimizing-clinical-database-storage.html
package main
import (
"bytes"
"log"
"os"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/andybalholm/brotli"
"github.com/tdewolff/minify"
"github.com/tdewolff/minify/css"
"github.com/tdewolff/minify/html"
)
// Application carries the state shared by every stage of the HTML clean-up
// pipeline that main drives. Fields are listed in the order the pipeline
// fills them in.
type Application struct {
	// inputFileData is the raw content read from input.html.
	inputFileData []byte
	// htmlDocument is the goquery document parsed from inputFileData.
	htmlDocument *goquery.Document
	// htmlMinifier is the minifier assembled by configureHTMLMinifier.
	htmlMinifier *minify.M
	// compressedHTML is the minified markup that is written to output.html.
	compressedHTML string
	// compressionBuffer receives the Brotli-encoded form of compressedHTML.
	compressionBuffer bytes.Buffer
	// errorEncountered holds the error returned by the most recent fallible step.
	errorEncountered error
}
// main runs the clean-up pipeline from reading input.html through to
// reporting the Brotli-compressed size. The stages run strictly in the order
// listed; each one panics on failure, which stops the run.
func main() {
	app := &Application{}

	pipeline := []func(){
		app.loadHTMLFile,
		app.parseHTML,
		app.replaceEmbeddedLogo,
		app.removeStyleAttributes,
		app.removeDuplicateMetaTags,
		app.removeEmptyParagraphTags,
		app.minifyHTML,
		app.addDoctype,
		app.writeHTMLOutput,
		app.compressHTML,
	}
	for _, stage := range pipeline {
		stage()
	}
}
// addDoctype prepends "<!doctype html>" to compressedHTML unless the markup
// already begins with a doctype declaration.
//
// The doctype keyword is ASCII case-insensitive in HTML, so the prefix is
// compared with case folding. A case-sensitive comparison would put a second
// declaration in front of markup that starts with "<!DOCTYPE".
func (app *Application) addDoctype() {
	const doctypePrefix = "<!doctype"

	markup := app.compressedHTML
	if len(markup) >= len(doctypePrefix) &&
		strings.EqualFold(markup[:len(doctypePrefix)], doctypePrefix) {
		return
	}
	app.compressedHTML = "<!doctype html>" + markup
}
// configureHTMLMinifier builds the minifier used by minifyHTML and stores it
// in app.htmlMinifier.
//
// Each media type is registered exactly once. Registering a minifier for a
// media type replaces any earlier registration for that type, so adding the
// default html.Minify before the configured html.Minifier has no effect and
// is not done here.
func (app *Application) configureHTMLMinifier() {
	m := minify.New()

	// Inline CSS (e.g. <style> blocks) goes through the stock CSS minifier.
	m.AddFunc("text/css", css.Minify)

	// HTML is minified conservatively: structure is kept intact and only
	// whitespace is collapsed.
	m.Add("text/html", &html.Minifier{
		KeepDefaultAttrVals: true,  // keep attributes even when they hold the default value
		KeepDocumentTags:    true,  // keep <html>, <head> and <body>
		KeepEndTags:         true,  // keep every closing tag
		KeepWhitespace:      false, // allow whitespace between tags to be collapsed
	})

	app.htmlMinifier = m
}
// loadHTMLFile reads input.html from the working directory into
// app.inputFileData and logs its size in bytes.
//
// It panics with the error text if the file cannot be read.
func (app *Application) loadHTMLFile() {
	app.inputFileData, app.errorEncountered = os.ReadFile("input.html")
	if app.errorEncountered != nil {
		panic(app.errorEncountered.Error())
	}
	// len on the byte slice already is the size in bytes; converting to a
	// string first would only copy the whole file to report the same number.
	log.Println("Input file size :", len(app.inputFileData), "bytes")
}
// minifyHTML serialises the (already cleaned) document, runs it through the
// configured minifier and stores the result in app.compressedHTML, then logs
// the resulting size in bytes.
//
// It panics with the error text if rendering or minifying fails.
func (app *Application) minifyHTML() {
	app.configureHTMLMinifier()

	// Named "rendered" rather than "html" so the imported html package is
	// not shadowed inside this method.
	var rendered string
	if rendered, app.errorEncountered = app.htmlDocument.Html(); app.errorEncountered != nil {
		panic(app.errorEncountered.Error())
	}

	if app.compressedHTML, app.errorEncountered = app.htmlMinifier.String("text/html", rendered); app.errorEncountered != nil {
		panic(app.errorEncountered.Error())
	}

	// Trim surrounding whitespace, then drop every non-breaking space
	// (U+00A0) that remains in the output.
	app.compressedHTML = strings.ReplaceAll(strings.TrimSpace(app.compressedHTML), "\u00A0", "")

	log.Println("Smushed file size :", len(app.compressedHTML), "bytes")
}
// parseHTML parses app.inputFileData into a goquery document and stores it
// in app.htmlDocument.
//
// The bytes are read in place through bytes.NewReader; going through
// string(...) and strings.NewReader would duplicate the whole input first.
//
// It panics with the error text if the document cannot be parsed.
func (app *Application) parseHTML() {
	app.htmlDocument, app.errorEncountered = goquery.NewDocumentFromReader(
		bytes.NewReader(app.inputFileData))
	if app.errorEncountered != nil {
		panic(app.errorEncountered.Error())
	}
}
// removeDuplicateMetaTags deletes every <meta> element that carries an
// http-equiv attribute from the document.
//
// NOTE(review): despite the name, the selector matches all such tags, not
// only repeated ones — confirm that dropping every one of them is intended.
func (app *Application) removeDuplicateMetaTags() {
	httpEquivMetas := app.htmlDocument.Find("meta[http-equiv]")
	httpEquivMetas.Remove()
}
// removeEmptyParagraphTags deletes <p> elements that contribute nothing to
// the page: no text other than whitespace and no embedded or form content.
//
// Text alone is not a sufficient test for emptiness. A paragraph whose only
// child is an image (for example the logo inserted by replaceEmbeddedLogo)
// has empty text but is not empty, so such paragraphs are kept.
func (app *Application) removeEmptyParagraphTags() {
	// Elements that render something even though they carry no text.
	const nonTextContent = "img, svg, canvas, video, audio, iframe, object, embed, " +
		"input, select, textarea, button"

	app.htmlDocument.Find("p").Each(func(_ int, p *goquery.Selection) {
		if strings.TrimSpace(p.Text()) != "" {
			return
		}
		if p.Find(nonTextContent).Length() > 0 {
			return
		}
		p.Remove()
	})
}
// removeStyleAttributes strips the inline style attribute from every element
// in the document. <style> elements themselves are not touched.
func (app *Application) removeStyleAttributes() {
	// RemoveAttr acts on each node of the selection, so no explicit
	// per-element loop is needed.
	everyElement := app.htmlDocument.Find("*")
	everyElement.RemoveAttr("style")
}
// replaceEmbeddedLogo swaps the first <img> whose src is an inline
// "data:image/jpg;" URI for an <img id="logo"> that points at a hosted file,
// so the image data is no longer carried inside the document.
//
// NOTE(review): only the "image/jpg" media type is matched; data URIs
// declared as "image/jpeg" (or any other type) are left alone — confirm this
// matches the stored documents.
func (app *Application) replaceEmbeddedLogo() {
	const (
		inlineJPGSelector = "img[src^='data:image/jpg;']"
		hostedLogoMarkup  = `<img id="logo" src="https://nhs.io/logo.jpg" width="718" height="89" >`
	)

	firstInlineJPG := app.htmlDocument.Find(inlineJPGSelector).First()
	firstInlineJPG.ReplaceWithHtml(hostedLogoMarkup)
}
// writeHTMLOutput saves app.compressedHTML to output.html in the working
// directory with mode 0644.
//
// It panics with the error text if the file cannot be written.
func (app *Application) writeHTMLOutput() {
	payload := []byte(app.compressedHTML)
	if app.errorEncountered = os.WriteFile("output.html", payload, 0644); app.errorEncountered != nil {
		panic(app.errorEncountered.Error())
	}
}
// compressHTML Brotli-compresses app.compressedHTML into
// app.compressionBuffer and logs the compressed size in bytes. The compressed
// bytes are only measured here; this method does not write them anywhere.
//
// It panics with the error text if compression fails.
func (app *Application) compressHTML() {
	source := []byte(app.compressedHTML)
	if app.compressionBuffer, app.errorEncountered = app.compressWithBrotli(source); app.errorEncountered != nil {
		panic(app.errorEncountered.Error())
	}
	log.Println("With Brotli Compression :", app.compressionBuffer.Len(), "bytes")
}
// compressWithBrotli encodes input with Brotli at brotli.BestCompression and
// returns a buffer holding the encoded bytes.
//
// On a write or close error the buffer is returned as it stands at that
// point, together with the error.
func (app *Application) compressWithBrotli(input []byte) (bytes.Buffer, error) {
	var encoded bytes.Buffer

	w := brotli.NewWriterLevel(&encoded, brotli.BestCompression)
	if _, writeErr := w.Write(input); writeErr != nil {
		return encoded, writeErr
	}

	// Close is kept as its own statement ahead of the return: it writes to
	// the buffer, so the buffer must not be copied into the result until it
	// has finished.
	closeErr := w.Close()
	if closeErr != nil {
		return encoded, closeErr
	}
	return encoded, nil
}