Skip to content

Instantly share code, notes, and snippets.

@kentquirk
Created September 17, 2015 21:45
Show Gist options
  • Save kentquirk/2f1e97adf759c0f17169 to your computer and use it in GitHub Desktop.
Save kentquirk/2f1e97adf759c0f17169 to your computer and use it in GitHub Desktop.
SanitizeHTML
// plainEntities rewrites a few common typographic entities to plain ASCII
// stand-ins. It runs before html.UnescapeString, which would otherwise decode
// the numeric quote entities and &nbsp; to their non-ASCII code points.
var plainEntities = strings.NewReplacer(
	"&#8216;", "'",
	"&#8217;", "'",
	"&#8220;", "\"",
	"&#8221;", "\"",
	"&nbsp;", " ",
	"&quot;", "\"",
	"&apos;", "'",
)

// isLineBreakTag reports whether tag (the text between '<' and '>') names a
// <br> element or a closing </p>, ignoring case, attributes and a trailing '/'.
func isLineBreakTag(tag string) bool {
	closing := strings.HasPrefix(tag, "/")
	name := strings.TrimPrefix(tag, "/")
	// The element name ends at the first whitespace or self-closing slash.
	if end := strings.IndexAny(name, " \t\n\r\f/"); end >= 0 {
		name = name[:end]
	}
	if strings.EqualFold(name, "br") {
		return true
	}
	return closing && strings.EqualFold(name, "p")
}

// stripTags removes everything between '<' and the next '>' from s, turning
// line-break tags into "\n" so paragraph structure survives as plain text.
func stripTags(s string) string {
	// Raw line breaks carry no meaning inside HTML markup, so drop them first.
	s = strings.Replace(s, "\n", "", -1)
	s = strings.Replace(s, "\r", "", -1)
	// Exact-match fast path for the most common spellings of a line break.
	s = strings.Replace(s, "</p>", "\n", -1)
	s = strings.Replace(s, "<br>", "\n", -1)
	s = strings.Replace(s, "</br>", "\n", -1)
	s = strings.Replace(s, "<br/>", "\n", -1)

	var text bytes.Buffer
	inTag := false
	tagStart := 0 // byte offset just past the '<' that opened the current tag
	for i, r := range s {
		switch r {
		case '<':
			if !inTag {
				inTag = true
				tagStart = i + 1
			}
		case '>':
			// Catch the variants the literal replacements above miss, such as
			// "<br />", "<BR>", "</P>" or a <br> carrying attributes.
			if inTag && isLineBreakTag(s[tagStart:i]) {
				text.WriteByte('\n')
			}
			inTag = false
		default:
			if !inTag {
				text.WriteRune(r)
			}
		}
	}
	return text.String()
}

// SanitizeHTML converts an HTML fragment to plain text: it strips tags,
// turns <br> and </p> into newlines, and decodes character entities.
//
// Input containing neither '<' nor '>' skips tag stripping and keeps its
// original line breaks; otherwise raw "\n" and "\r" are removed and only
// line-break tags produce newlines. A '<' that is never closed discards the
// remainder of the input, and a stray '>' outside a tag is dropped.
//
// Curly quotes and &nbsp; are mapped to ASCII quotes and a regular space; all
// other entities are decoded by html.UnescapeString. Because entities such as
// &lt; are decoded after tag stripping, the result may contain '<' and '>'
// and must be escaped again before being embedded in HTML.
func SanitizeHTML(s string) string {
	if strings.ContainsAny(s, "<>") {
		s = stripTags(s)
	}
	s = plainEntities.Replace(s)
	return html.UnescapeString(s)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment