Skip to content

Instantly share code, notes, and snippets.

@PaulBradley
Last active July 3, 2025 08:36
Show Gist options
  • Save PaulBradley/64aced2ba268c28fea7738b7900d3e13 to your computer and use it in GitHub Desktop.
Save PaulBradley/64aced2ba268c28fea7738b7900d3e13 to your computer and use it in GitHub Desktop.
Golang code to remove HTML tag soup and slop from clinic letters

Accompanying article showing how we reduced the storage requirements of a huge clinical database by removing legacy HTML inefficiencies - #TagSoup #Slop : https://bradley.software/article/optimizing-clinical-database-storage.html

package main

import (
	"bytes"
	"log"
	"os"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/andybalholm/brotli"

	"github.com/tdewolff/minify"
	"github.com/tdewolff/minify/css"
	"github.com/tdewolff/minify/html"
)

// Application carries the state shared by the steps of the clean-up
// pipeline. Every step is a method that reads and writes these fields.
type Application struct {
	// inputFileData holds the raw bytes read from the input file.
	inputFileData []byte

	// htmlDocument is the parsed document that the clean-up steps mutate.
	htmlDocument *goquery.Document

	// htmlMinifier is the minifier built by configureHTMLMinifier.
	htmlMinifier *minify.M

	// compressedHTML is the minified markup produced by minifyHTML.
	compressedHTML string

	// compressionBuffer receives the Brotli-compressed form of compressedHTML.
	compressionBuffer bytes.Buffer

	// errorEncountered stores the error result of the most recent fallible call.
	errorEncountered error
}

// main runs the clean-up pipeline once, in a fixed order: load and parse the
// input, strip the unwanted markup, minify, prepend a doctype, write the
// result to disk and finally report the Brotli-compressed size.
func main() {
	var app Application

	// Each step mutates app in place; a failing step panics and so aborts
	// the remaining steps.
	pipeline := []func(){
		app.loadHTMLFile,
		app.parseHTML,
		app.replaceEmbeddedLogo,
		app.removeStyleAttributes,
		app.removeDuplicateMetaTags,
		app.removeEmptyParagraphTags,
		app.minifyHTML,
		app.addDoctype,
		app.writeHTMLOutput,
		app.compressHTML,
	}

	for _, step := range pipeline {
		step()
	}
}

// addDoctype makes sure compressedHTML starts with a doctype declaration,
// prepending "<!doctype html>" when none is present.
//
// The existing prefix is matched case-insensitively because the doctype
// keyword may be written in any case ("<!DOCTYPE html>" is the common
// spelling); a case-sensitive check would add a second doctype to such input.
func (app *Application) addDoctype() {
	const prefix = "<!doctype"

	if len(app.compressedHTML) >= len(prefix) &&
		strings.EqualFold(app.compressedHTML[:len(prefix)], prefix) {
		return
	}

	app.compressedHTML = "<!doctype html>" + app.compressedHTML
}

// configureHTMLMinifier builds the minifier used by minifyHTML and stores it
// in app.htmlMinifier. It registers a CSS minifier plus an HTML minifier that
// keeps default attribute values, document tags and end tags.
func (app *Application) configureHTMLMinifier() {
	m := minify.New()

	m.AddFunc("text/css", css.Minify)
	m.AddFunc("text/html", html.Minify)

	// NOTE(review): "text/html" is registered a second time here, after the
	// html.Minify registration above. Presumably this later registration is
	// the one that takes effect - confirm against the minify documentation.
	htmlOptions := &html.Minifier{
		KeepDefaultAttrVals: true,
		KeepDocumentTags:    true,
		KeepEndTags:         true,
		KeepWhitespace:      false,
	}
	m.Add("text/html", htmlOptions)

	app.htmlMinifier = m
}

// loadHTMLFile reads "input.html" from the working directory into
// app.inputFileData and logs its size in bytes. It panics with the error
// text when the file cannot be read.
func (app *Application) loadHTMLFile() {
	data, err := os.ReadFile("input.html")
	app.inputFileData, app.errorEncountered = data, err
	if err != nil {
		panic(err.Error())
	}

	log.Println("Input file size         :", len(data), "bytes")
}

// minifyHTML serialises the parsed document, runs it through the configured
// minifier and stores the result in app.compressedHTML, then logs the
// resulting size in bytes. It panics with the error text if serialisation or
// minification fails.
func (app *Application) minifyHTML() {
	app.configureHTMLMinifier()

	rendered, err := app.htmlDocument.Html()
	app.errorEncountered = err
	if err != nil {
		panic(err.Error())
	}

	minified, err := app.htmlMinifier.String("text/html", rendered)
	app.compressedHTML, app.errorEncountered = minified, err
	if err != nil {
		panic(err.Error())
	}

	// Trim surrounding whitespace, then drop every non-breaking space.
	// NOTE(review): the removal is unconditional, so a non-breaking space
	// used as a word separator is deleted too - confirm this is intended.
	app.compressedHTML = strings.ReplaceAll(strings.TrimSpace(minified), "\u00A0", "")

	log.Println("Smushed file size       :", len(app.compressedHTML), "bytes")
}

// parseHTML parses the loaded file contents into app.htmlDocument. It panics
// with the error text if the parser reports a failure.
func (app *Application) parseHTML() {
	source := strings.NewReader(string(app.inputFileData))

	doc, err := goquery.NewDocumentFromReader(source)
	app.htmlDocument, app.errorEncountered = doc, err
	if err != nil {
		panic(err.Error())
	}
}

// removeDuplicateMetaTags removes every <meta> element that carries an
// http-equiv attribute. Despite the name, all such elements are dropped,
// not only repeated ones.
func (app *Application) removeDuplicateMetaTags() {
	httpEquivMetas := app.htmlDocument.Find("meta[http-equiv]")
	httpEquivMetas.Remove()
}

// removeEmptyParagraphTags deletes <p> elements that have no content worth
// keeping: their text is empty once surrounding whitespace is trimmed
// (strings.TrimSpace also treats the non-breaking space as whitespace) and
// they contain no embedded content.
//
// Paragraphs that hold only an image or other embedded element have no text
// but still render something, so they are kept; testing the text alone would
// delete them together with the image, including a logo placed inside a <p>.
// Paragraphs containing nothing but line breaks are still removed.
func (app *Application) removeEmptyParagraphTags() {
	// Elements that display content without contributing any text.
	const embeddedContent = "img, svg, object, embed, iframe, video, audio, canvas"

	app.htmlDocument.Find("p").Each(
		func(_ int, s *goquery.Selection) {
			if strings.TrimSpace(s.Text()) != "" {
				return
			}
			if s.Find(embeddedContent).Length() > 0 {
				return
			}
			s.Remove()
		},
	)
}

// removeStyleAttributes strips the inline style attribute from every element
// in the document that has one.
func (app *Application) removeStyleAttributes() {
	app.htmlDocument.Find("[style]").RemoveAttr("style")
}

// replaceEmbeddedLogo swaps the first <img> whose src is an inline
// "data:image/jpg;" URI for an <img> that references a hosted logo file.
// Only the first match is replaced; later embedded images are left alone.
// NOTE(review): this assumes the first embedded JPG is the logo - confirm.
func (app *Application) replaceEmbeddedLogo() {
	const (
		embeddedJPG = "img[src^='data:image/jpg;']"
		hostedLogo  = `<img id="logo" src="https://nhs.io/logo.jpg" width="718" height="89" >`
	)

	firstEmbedded := app.htmlDocument.Find(embeddedJPG).First()
	firstEmbedded.ReplaceWithHtml(hostedLogo)
}

// writeHTMLOutput writes app.compressedHTML to "output.html" in the working
// directory with mode 0644. It panics with the error text if the write fails.
func (app *Application) writeHTMLOutput() {
	payload := []byte(app.compressedHTML)

	err := os.WriteFile("output.html", payload, 0644)
	app.errorEncountered = err
	if err != nil {
		panic(err.Error())
	}
}

// compressHTML Brotli-compresses app.compressedHTML into
// app.compressionBuffer and logs the compressed size in bytes. The compressed
// data is only held in memory; this method does not write it anywhere.
// It panics with the error text if compression fails.
func (app *Application) compressHTML() {
	compressed, err := app.compressWithBrotli([]byte(app.compressedHTML))
	app.compressionBuffer, app.errorEncountered = compressed, err
	if err != nil {
		panic(err.Error())
	}

	log.Println("With Brotli Compression :", app.compressionBuffer.Len(), "bytes")
}

// compressWithBrotli compresses input with Brotli at brotli.BestCompression
// and returns the buffer holding the output. On a write or close error the
// buffer is returned as it stands, together with that error.
func (app *Application) compressWithBrotli(input []byte) (bytes.Buffer, error) {
	var out bytes.Buffer

	w := brotli.NewWriterLevel(&out, brotli.BestCompression)

	if _, err := w.Write(input); err != nil {
		return out, err
	}

	// The writer is closed before out is returned on the success path;
	// presumably Close flushes any output still pending in the encoder.
	err := w.Close()
	if err != nil {
		return out, err
	}

	return out, nil
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment