Skip to content

Instantly share code, notes, and snippets.

@ifraixedes
Last active July 19, 2021 14:43
Show Gist options
  • Save ifraixedes/8846ae14cef2ee5ce8ac3e6a78e500cb to your computer and use it in GitHub Desktop.
Save ifraixedes/8846ae14cef2ee5ce8ac3e6a78e500cb to your computer and use it in GitHub Desktop.
Inverse of the json.HTMLEscape function from the standard library (encoding/json)
package main
// HTMLUnescape replaces every occurrence in src of the byte sequences \u003c,
// \u003e, \u0026, \u2028 and \u2029 with the characters they represent: <, >,
// &, U+2028 and U+2029 (the last two written in their UTF-8 encoding). It is
// the inverse of the HTMLEscape function of the encoding/json package.
//
// Only the lowercase hexadecimal spelling listed above is recognized. A
// backslash followed by any byte other than 'u' is treated as a two-byte
// escape and kept verbatim, so an escaped backslash followed by "u003c"
// (`\\u003c`) is left untouched.
//
// To avoid allocations the replacement is done in place: the contents of src
// are overwritten and the returned slice is a prefix of src's backing array,
// possibly shorter than src. Callers must use the returned slice and must not
// rely on the bytes of src beyond the returned length.
func HTMLUnescape(src []byte) []byte {
	// escLen is the length of every recognized sequence, e.g. `\u003c`.
	const escLen = 6

	// r is the read position and w the write position. Every replacement is
	// shorter than the sequence it replaces, so w never overtakes r and the
	// output can be compacted into src in a single pass, without shifting the
	// tail of the slice on every replacement.
	r, w := 0, 0

	// Once fewer than escLen bytes remain no sequence can start any more; the
	// tail is copied verbatim after the loop.
	for r+escLen <= len(src) {
		if src[r] != '\\' {
			src[w] = src[r]
			r++
			w++
			continue
		}

		// n is the number of bytes written for a recognized sequence; it stays
		// 0 when the backslash does not start one.
		n := 0
		if src[r+1] == 'u' {
			switch string(src[r+2 : r+escLen]) {
			case "003c":
				src[w] = '<'
				n = 1
			case "003e":
				src[w] = '>'
				n = 1
			case "0026":
				src[w] = '&'
				n = 1
			case "2028":
				// UTF-8 encoding of U+2028 (line separator).
				src[w], src[w+1], src[w+2] = 0xE2, 0x80, 0xA8
				n = 3
			case "2029":
				// UTF-8 encoding of U+2029 (paragraph separator).
				src[w], src[w+1], src[w+2] = 0xE2, 0x80, 0xA9
				n = 3
			}
		}
		if n > 0 {
			r += escLen
			w += n
			continue
		}

		// Not a recognized sequence: keep the backslash and the byte that
		// follows it, then resume scanning right after them. Resuming there,
		// instead of skipping past the bytes that failed to match, guarantees
		// that an escape starting inside a malformed one (e.g. `\u00\u003c`)
		// is still found.
		src[w], src[w+1] = src[r], src[r+1]
		r += 2
		w += 2
	}

	// Move the unscanned tail next to the already compacted output.
	w += copy(src[w:], src[r:])
	return src[:w]
}
package main
import (
"fmt"
"testing"
)
// TestHTMLUnescape feeds HTMLUnescape inputs containing each supported escape
// sequence (alone and combined), plus an input without any escape, and
// compares the returned bytes with the expected unescaped text.
func TestHTMLUnescape(t *testing.T) {
	// UTF-8 encodings of U+2028 and U+2029.
	lineSep := string([]byte{0xE2, 0x80, 0xA8})
	paraSep := string([]byte{0xE2, 0x80, 0xA9})

	type unescapeCase struct {
		input string
		want  string
	}

	cases := []unescapeCase{
		{input: `1 is \u003c than 5`, want: "1 is < than 5"},
		{input: `10 is \u003e than 5`, want: "10 is > than 5"},
		{input: `black \u0026 white`, want: "black & white"},
		{input: `-- \u2028 --`, want: fmt.Sprintf("-- %s --", lineSep)},
		{input: `Hey: \u2029`, want: fmt.Sprintf("Hey: %s", paraSep)},
		{
			input: "no escaped characters keep the slice of bytes as it's",
			want:  "no escaped characters keep the slice of bytes as it's",
		},
		{
			input: `\u003c\u003e\u0026\u2028\u2029`,
			want:  fmt.Sprintf("<>&%s%s", lineSep, paraSep),
		},
	}

	for idx := range cases {
		got := string(HTMLUnescape([]byte(cases[idx].input)))
		if got == cases[idx].want {
			continue
		}
		t.Errorf("unexpected result for test case %d; want=%q, got=%q", idx, cases[idx].want, got)
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment