42LM · September 9, 2025 11:13
diff --git a/sanitize.go b/sanitize.go
 package sanitize

 import (
 	"html"
 	"reflect"
 	"regexp"
 	"unicode"
 	"unicode/utf8"

 	"github.com/microcosm-cc/bluemonday"
 )

 // filterChars is a lookup table indexed by byte value. A non-zero entry
 // marks an ASCII byte that Whitespace strips from either end of a string.
 var filterChars = [256]uint8{
 	'\a': 1, // 0x07 bell
 	'\b': 1, // 0x08 backspace
 	'\t': 1, // 0x09 horizontal tab
 	'\n': 1, // 0x0A line feed
 	'\v': 1, // 0x0B vertical tab
 	'\f': 1, // 0x0C form feed
 	'\r': 1, // 0x0D carriage return
 	' ':  1, // 0x20 space
 }

 // Whitespace trims s at both ends. ASCII bytes flagged in filterChars are
 // skipped byte by byte; as soon as a non-ASCII byte is met on either side the
 // remaining work is handed to whitespaceUnicode, which also removes leading
 // and trailing Explicit Formatting Type Unicode code points.
 func Whitespace(s string) string {
 	lo := 0
 	hi := len(s)

 	// Advance past leading ASCII bytes that are flagged for removal.
 	for lo < len(s) {
 		b := s[lo]
 		if b >= utf8.RuneSelf {
 			// Non-ASCII from here on: the rune-aware routine trims both
 			// ends of what is left.
 			return whitespaceUnicode(s[lo:])
 		}
 		if filterChars[b] == 0 {
 			break
 		}
 		lo++
 	}

 	// Walk back over trailing ASCII bytes that are flagged for removal.
 	for hi > lo {
 		b := s[hi-1]
 		if b >= utf8.RuneSelf {
 			return whitespaceUnicode(s[lo:hi])
 		}
 		if filterChars[b] == 0 {
 			break
 		}
 		hi--
 	}

 	return s[lo:hi]
 }

 // Policy encapsulates the *bluemonday.Policy that holds
 // the allowlist of HTML elements and attributes that will
 // be applied to the sanitised HTML.
 //
 // Values are created with NewDefaultPolicy or NewTemplatePolicy.
 type Policy struct {
 	// p is the wrapped bluemonday policy; sanitize calls p.Sanitize on it.
 	p *bluemonday.Policy
 }

 // NewDefaultPolicy returns a Policy wrapping a fresh, blank bluemonday
 // policy: nothing is allowed or permitted.
 func NewDefaultPolicy() *Policy {
 	blank := bluemonday.NewPolicy()
 	return &Policy{p: blank}
 }

 // NewTemplatePolicy creates a policy that allows widely used elements and
 // attributes to design email templates.
 //
 // href values are accepted only when they start with "https:" or "mailto:",
 // or when they are exactly "#". The alternation is wrapped in a group so the
 // leading anchor applies to every branch; an ungrouped `...:|#$` would accept
 // any value that merely ends in '#', such as "javascript:alert(1)//#".
 //
 // ⚠️❗WARNING: We trust CSS from any source.
 func NewTemplatePolicy() *Policy {
 	p := bluemonday.NewPolicy()
 	p.AllowTables()
 	p.AllowStandardAttributes()
 	p.AllowStyling()
 	p.AllowDataAttributes()
 	p.AllowComments()
 	// bluemonday refuses the <style> element (allowed below) unless unsafe
 	// mode is switched on.
 	p.AllowUnsafe(true)
 	p.AllowElements(
 		"html", "head", "title", "style", "body",
 		"input", "label", "video", "source", "link",
 		"span", "img", "div", "ol", "ul", "li", "br",
 	)
 	p.AllowAttrs(
 		"style", "border", "role", "align", "aria-hidden",
 		"width", "height", "important", "background", "tabindex",
 		"type", "src", "name", "cellspacing", "cellpadding",
 		"checked", "for", "rel", "alt",
 	).Globally()
 	p.AllowAttrs("href").Matching(regexp.MustCompile(`^(?:(?:https|mailto):|#$)`)).Globally()
 	p.AllowAttrs("target").Matching(regexp.MustCompile(`^_blank$`)).OnElements("a")
 	p.AllowAttrs("poster", "preload", "controls").OnElements("video")
 	p.AllowAttrs("http-equiv", "content", "charset").OnElements("meta")
 	p.AllowAttrs("xmlns", "xmlns:v", "xmlns:o", "xml:lang").OnElements("html")

 	return &Policy{
 		p,
 	}
 }

 // Sanitize applies the policy allowlist to the text fields of the struct a
 // and returns a pointer to a sanitized copy (*T for an input of type T).
 //
 // Exported fields of type string, *string and map[string]string (values
 // only) are trimmed, sanitized and then HTML-unescaped. A nil *string or nil
 // map stays nil. Every other field, including unexported ones, is copied
 // over unchanged; nested structs are not descended into.
 //
 // If a is not a struct, a pointer to the zero value of its type is returned.
 // If a is nil, nil is returned.
 //
 // NOTE(review): unescaping after sanitizing turns escaped markup such as
 // "&lt;b&gt;" back into "<b>", so the result has to be escaped again before
 // it is rendered as HTML. Confirm this is intended.
 //
 // NOTE(review): slices, nested structs and other containers are copied but
 // NOT sanitized. Extend the switch below if such fields can carry untrusted
 // HTML.
 func (p *Policy) Sanitize(a any) any {
 	if a == nil {
 		// reflect.New panics on a nil type and there is nothing to sanitize.
 		return nil
 	}

 	t := reflect.TypeOf(a)
 	newStruct := reflect.New(t)
 	if t.Kind() != reflect.Struct {
 		return newStruct.Interface()
 	}

 	out := newStruct.Elem()
 	// Start from a full copy so that fields which are not rewritten below
 	// (numbers, bools, nested structs, unexported fields, ...) keep their
 	// values instead of being reset to zero.
 	out.Set(reflect.ValueOf(a))

 	clean := func(s string) string {
 		return html.UnescapeString(p.sanitize(s))
 	}

 	for i := 0; i < t.NumField(); i++ {
 		f := out.Field(i)
 		if !f.CanSet() {
 			// Unexported field: Interface() would panic on it, and the copy
 			// above has already carried its value over.
 			continue
 		}

 		switch tmp := f.Interface().(type) {
 		case string:
 			f.Set(reflect.ValueOf(clean(tmp)))
 		case *string:
 			if tmp != nil {
 				// Point at a fresh string so the caller's value is untouched.
 				s := clean(*tmp)
 				f.Set(reflect.ValueOf(&s))
 			}
 		case map[string]string:
 			if tmp != nil {
 				// Build a new map so the caller's map is not modified.
 				m := make(map[string]string, len(tmp))
 				for k, val := range tmp {
 					m[k] = clean(val)
 				}
 				f.Set(reflect.ValueOf(m))
 			}
 		}
 	}

 	return newStruct.Interface()
 }

 // sanitize trims s with Whitespace and then runs the result through the
 // wrapped bluemonday policy.
 func (p *Policy) sanitize(s string) string {
 	trimmed := Whitespace(s)
 	return p.p.Sanitize(trimmed)
 }

 // whitespaceUnicode returns s with every leading and trailing rune for which
 // isValidChar reports false removed (whitespace, control characters and
 // Explicit Formatting Type code points). An empty s yields "".
 //
 // s is round-tripped through []rune, so bytes that are not valid UTF-8 come
 // back as U+FFFD in the result.
 func whitespaceUnicode(s string) string {
 	runes := []rune(s)

 	// Skip leading runes that are not valid characters. When the caller has
 	// already trimmed the left side this stops on the first rune. Checking
 	// the bound before indexing keeps the empty string from panicking.
 	start := 0
 	for start < len(runes) && !isValidChar(runes[start]) {
 		start++
 	}

 	// Walk back from the end to the last valid character.
 	stop := len(runes)
 	for stop > start && !isValidChar(runes[stop-1]) {
 		stop--
 	}

 	return string(runes[start:stop])
 }

 // isValidChar reports whether c is content that must be kept when trimming:
 // a letter, mark, number, punctuation or symbol code point.
 //
 // Marks (combining accents, variation selectors) count as valid so that text
 // such as "e\u0301" or an emoji followed by U+FE0F is not truncated.
 // Whitespace, control and format code points are not valid.
 func isValidChar(c rune) bool {
 	return unicode.IsLetter(c) ||
 		unicode.IsMark(c) ||
 		unicode.IsNumber(c) ||
 		unicode.IsPunct(c) ||
 		unicode.IsSymbol(c)
 }
	package sanitize

	import (
	"html"
	"reflect"
	"regexp"
	"unicode"
	"unicode/utf8"

	"github.com/microcosm-cc/bluemonday"
	)

	// filterChars is a lookup table indexed by byte value. A non-zero entry
	// marks an ASCII byte that Whitespace strips from either end of a string.
	var filterChars = [256]uint8{
		'\a': 1, // 0x07 bell
		'\b': 1, // 0x08 backspace
		'\t': 1, // 0x09 horizontal tab
		'\n': 1, // 0x0A line feed
		'\v': 1, // 0x0B vertical tab
		'\f': 1, // 0x0C form feed
		'\r': 1, // 0x0D carriage return
		' ':  1, // 0x20 space
	}

	// Whitespace trims s at both ends. ASCII bytes flagged in filterChars are
	// skipped byte by byte; as soon as a non-ASCII byte is met on either side the
	// remaining work is handed to whitespaceUnicode, which also removes leading
	// and trailing Explicit Formatting Type Unicode code points.
	func Whitespace(s string) string {
		lo := 0
		hi := len(s)

		// Advance past leading ASCII bytes that are flagged for removal.
		for lo < len(s) {
			b := s[lo]
			if b >= utf8.RuneSelf {
				// Non-ASCII from here on: the rune-aware routine trims both
				// ends of what is left.
				return whitespaceUnicode(s[lo:])
			}
			if filterChars[b] == 0 {
				break
			}
			lo++
		}

		// Walk back over trailing ASCII bytes that are flagged for removal.
		for hi > lo {
			b := s[hi-1]
			if b >= utf8.RuneSelf {
				return whitespaceUnicode(s[lo:hi])
			}
			if filterChars[b] == 0 {
				break
			}
			hi--
		}

		return s[lo:hi]
	}

	// Policy encapsulates the *bluemonday.Policy that holds
	// the allowlist of HTML elements and attributes that will
	// be applied to the sanitised HTML.
	//
	// Values are created with NewDefaultPolicy or NewTemplatePolicy.
	type Policy struct {
	// p is the wrapped bluemonday policy; sanitize calls p.Sanitize on it.
	p *bluemonday.Policy
	}

	// NewDefaultPolicy returns a Policy wrapping a fresh, blank bluemonday
	// policy: nothing is allowed or permitted.
	func NewDefaultPolicy() *Policy {
		blank := bluemonday.NewPolicy()
		return &Policy{p: blank}
	}

	// NewTemplatePolicy creates a policy that allows widely used elements and
	// attributes to design email templates.
	//
	// href values are accepted only when they start with "https:" or "mailto:",
	// or when they are exactly "#". The alternation is wrapped in a group so the
	// leading anchor applies to every branch; an ungrouped `...:|#$` would accept
	// any value that merely ends in '#', such as "javascript:alert(1)//#".
	//
	// ⚠️❗WARNING: We trust CSS from any source.
	func NewTemplatePolicy() *Policy {
		p := bluemonday.NewPolicy()
		p.AllowTables()
		p.AllowStandardAttributes()
		p.AllowStyling()
		p.AllowDataAttributes()
		p.AllowComments()
		// bluemonday refuses the <style> element (allowed below) unless unsafe
		// mode is switched on.
		p.AllowUnsafe(true)
		p.AllowElements(
			"html", "head", "title", "style", "body",
			"input", "label", "video", "source", "link",
			"span", "img", "div", "ol", "ul", "li", "br",
		)
		p.AllowAttrs(
			"style", "border", "role", "align", "aria-hidden",
			"width", "height", "important", "background", "tabindex",
			"type", "src", "name", "cellspacing", "cellpadding",
			"checked", "for", "rel", "alt",
		).Globally()
		p.AllowAttrs("href").Matching(regexp.MustCompile(`^(?:(?:https|mailto):|#$)`)).Globally()
		p.AllowAttrs("target").Matching(regexp.MustCompile(`^_blank$`)).OnElements("a")
		p.AllowAttrs("poster", "preload", "controls").OnElements("video")
		p.AllowAttrs("http-equiv", "content", "charset").OnElements("meta")
		p.AllowAttrs("xmlns", "xmlns:v", "xmlns:o", "xml:lang").OnElements("html")

		return &Policy{
			p,
		}
	}

	// Sanitize applies the policy allowlist to the text fields of the struct a
	// and returns a pointer to a sanitized copy (*T for an input of type T).
	//
	// Exported fields of type string, *string and map[string]string (values
	// only) are trimmed, sanitized and then HTML-unescaped. A nil *string or nil
	// map stays nil. Every other field, including unexported ones, is copied
	// over unchanged; nested structs are not descended into.
	//
	// If a is not a struct, a pointer to the zero value of its type is returned.
	// If a is nil, nil is returned.
	//
	// NOTE(review): unescaping after sanitizing turns escaped markup such as
	// "&lt;b&gt;" back into "<b>", so the result has to be escaped again before
	// it is rendered as HTML. Confirm this is intended.
	//
	// NOTE(review): slices, nested structs and other containers are copied but
	// NOT sanitized. Extend the switch below if such fields can carry untrusted
	// HTML.
	func (p *Policy) Sanitize(a any) any {
		if a == nil {
			// reflect.New panics on a nil type and there is nothing to sanitize.
			return nil
		}

		t := reflect.TypeOf(a)
		newStruct := reflect.New(t)
		if t.Kind() != reflect.Struct {
			return newStruct.Interface()
		}

		out := newStruct.Elem()
		// Start from a full copy so that fields which are not rewritten below
		// (numbers, bools, nested structs, unexported fields, ...) keep their
		// values instead of being reset to zero.
		out.Set(reflect.ValueOf(a))

		clean := func(s string) string {
			return html.UnescapeString(p.sanitize(s))
		}

		for i := 0; i < t.NumField(); i++ {
			f := out.Field(i)
			if !f.CanSet() {
				// Unexported field: Interface() would panic on it, and the copy
				// above has already carried its value over.
				continue
			}

			switch tmp := f.Interface().(type) {
			case string:
				f.Set(reflect.ValueOf(clean(tmp)))
			case *string:
				if tmp != nil {
					// Point at a fresh string so the caller's value is untouched.
					s := clean(*tmp)
					f.Set(reflect.ValueOf(&s))
				}
			case map[string]string:
				if tmp != nil {
					// Build a new map so the caller's map is not modified.
					m := make(map[string]string, len(tmp))
					for k, val := range tmp {
						m[k] = clean(val)
					}
					f.Set(reflect.ValueOf(m))
				}
			}
		}

		return newStruct.Interface()
	}

	// sanitize trims s with Whitespace and then runs the result through the
	// wrapped bluemonday policy.
	func (p *Policy) sanitize(s string) string {
		trimmed := Whitespace(s)
		return p.p.Sanitize(trimmed)
	}

	// whitespaceUnicode returns s with every leading and trailing rune for which
	// isValidChar reports false removed (whitespace, control characters and
	// Explicit Formatting Type code points). An empty s yields "".
	//
	// s is round-tripped through []rune, so bytes that are not valid UTF-8 come
	// back as U+FFFD in the result.
	func whitespaceUnicode(s string) string {
		runes := []rune(s)

		// Skip leading runes that are not valid characters. When the caller has
		// already trimmed the left side this stops on the first rune. Checking
		// the bound before indexing keeps the empty string from panicking.
		start := 0
		for start < len(runes) && !isValidChar(runes[start]) {
			start++
		}

		// Walk back from the end to the last valid character.
		stop := len(runes)
		for stop > start && !isValidChar(runes[stop-1]) {
			stop--
		}

		return string(runes[start:stop])
	}

	// isValidChar reports whether c is content that must be kept when trimming:
	// a letter, mark, number, punctuation or symbol code point.
	//
	// Marks (combining accents, variation selectors) count as valid so that text
	// such as "e\u0301" or an emoji followed by U+FE0F is not truncated.
	// Whitespace, control and format code points are not valid.
	func isValidChar(c rune) bool {
		return unicode.IsLetter(c) ||
			unicode.IsMark(c) ||
			unicode.IsNumber(c) ||
			unicode.IsPunct(c) ||
			unicode.IsSymbol(c)
	}
No results found