Last active
May 27, 2024 04:36
-
-
Save mwaterfall/25b4a6a06dc3309d9555 to your computer and use it in GitHub Desktop.
Decoding HTML Entities in Swift
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Very slightly adapted from http://stackoverflow.com/a/30141700/106244
// 99.99% Credit to Martin R!
// Mapping from XML/HTML character entity reference to character
// From http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
/// Lookup table from a complete named character entity reference
/// (including the leading `&` and the trailing `;`, e.g. `"&amp;"`)
/// to the single character it represents.
///
/// Keys are case-sensitive (`"&Dagger;"` and `"&dagger;"` are different
/// entities). The table holds the five XML predefined entities followed by
/// the HTML 4 named entities, ordered by code point.
private let characterEntities : [String: Character] = [
    // XML predefined entities:
    "&quot;"     : "\"",
    "&amp;"      : "&",
    "&apos;"     : "'",
    "&lt;"       : "<",
    "&gt;"       : ">",

    // HTML character entity references:
    "&nbsp;"     : "\u{00A0}",
    "&iexcl;"    : "\u{00A1}",
    "&cent;"     : "\u{00A2}",
    "&pound;"    : "\u{00A3}",
    "&curren;"   : "\u{00A4}",
    "&yen;"      : "\u{00A5}",
    "&brvbar;"   : "\u{00A6}",
    "&sect;"     : "\u{00A7}",
    "&uml;"      : "\u{00A8}",
    "&copy;"     : "\u{00A9}",
    "&ordf;"     : "\u{00AA}",
    "&laquo;"    : "\u{00AB}",
    "&not;"      : "\u{00AC}",
    "&shy;"      : "\u{00AD}",
    "&reg;"      : "\u{00AE}",
    "&macr;"     : "\u{00AF}",
    "&deg;"      : "\u{00B0}",
    "&plusmn;"   : "\u{00B1}",
    "&sup2;"     : "\u{00B2}",
    "&sup3;"     : "\u{00B3}",
    "&acute;"    : "\u{00B4}",
    "&micro;"    : "\u{00B5}",
    "&para;"     : "\u{00B6}",
    "&middot;"   : "\u{00B7}",
    "&cedil;"    : "\u{00B8}",
    "&sup1;"     : "\u{00B9}",
    "&ordm;"     : "\u{00BA}",
    "&raquo;"    : "\u{00BB}",
    "&frac14;"   : "\u{00BC}",
    "&frac12;"   : "\u{00BD}",
    "&frac34;"   : "\u{00BE}",
    "&iquest;"   : "\u{00BF}",
    "&Agrave;"   : "\u{00C0}",
    "&Aacute;"   : "\u{00C1}",
    "&Acirc;"    : "\u{00C2}",
    "&Atilde;"   : "\u{00C3}",
    "&Auml;"     : "\u{00C4}",
    "&Aring;"    : "\u{00C5}",
    "&AElig;"    : "\u{00C6}",
    "&Ccedil;"   : "\u{00C7}",
    "&Egrave;"   : "\u{00C8}",
    "&Eacute;"   : "\u{00C9}",
    "&Ecirc;"    : "\u{00CA}",
    "&Euml;"     : "\u{00CB}",
    "&Igrave;"   : "\u{00CC}",
    "&Iacute;"   : "\u{00CD}",
    "&Icirc;"    : "\u{00CE}",
    "&Iuml;"     : "\u{00CF}",
    "&ETH;"      : "\u{00D0}",
    "&Ntilde;"   : "\u{00D1}",
    "&Ograve;"   : "\u{00D2}",
    "&Oacute;"   : "\u{00D3}",
    "&Ocirc;"    : "\u{00D4}",
    "&Otilde;"   : "\u{00D5}",
    "&Ouml;"     : "\u{00D6}",
    "&times;"    : "\u{00D7}",
    "&Oslash;"   : "\u{00D8}",
    "&Ugrave;"   : "\u{00D9}",
    "&Uacute;"   : "\u{00DA}",
    "&Ucirc;"    : "\u{00DB}",
    "&Uuml;"     : "\u{00DC}",
    "&Yacute;"   : "\u{00DD}",
    "&THORN;"    : "\u{00DE}",
    "&szlig;"    : "\u{00DF}",
    "&agrave;"   : "\u{00E0}",
    "&aacute;"   : "\u{00E1}",
    "&acirc;"    : "\u{00E2}",
    "&atilde;"   : "\u{00E3}",
    "&auml;"     : "\u{00E4}",
    "&aring;"    : "\u{00E5}",
    "&aelig;"    : "\u{00E6}",
    "&ccedil;"   : "\u{00E7}",
    "&egrave;"   : "\u{00E8}",
    "&eacute;"   : "\u{00E9}",
    "&ecirc;"    : "\u{00EA}",
    "&euml;"     : "\u{00EB}",
    "&igrave;"   : "\u{00EC}",
    "&iacute;"   : "\u{00ED}",
    "&icirc;"    : "\u{00EE}",
    "&iuml;"     : "\u{00EF}",
    "&eth;"      : "\u{00F0}",
    "&ntilde;"   : "\u{00F1}",
    "&ograve;"   : "\u{00F2}",
    "&oacute;"   : "\u{00F3}",
    "&ocirc;"    : "\u{00F4}",
    "&otilde;"   : "\u{00F5}",
    "&ouml;"     : "\u{00F6}",
    "&divide;"   : "\u{00F7}",
    "&oslash;"   : "\u{00F8}",
    "&ugrave;"   : "\u{00F9}",
    "&uacute;"   : "\u{00FA}",
    "&ucirc;"    : "\u{00FB}",
    "&uuml;"     : "\u{00FC}",
    "&yacute;"   : "\u{00FD}",
    "&thorn;"    : "\u{00FE}",
    "&yuml;"     : "\u{00FF}",
    "&OElig;"    : "\u{0152}",
    "&oelig;"    : "\u{0153}",
    "&Scaron;"   : "\u{0160}",
    "&scaron;"   : "\u{0161}",
    "&Yuml;"     : "\u{0178}",
    "&fnof;"     : "\u{0192}",
    "&circ;"     : "\u{02C6}",
    "&tilde;"    : "\u{02DC}",
    "&Alpha;"    : "\u{0391}",
    "&Beta;"     : "\u{0392}",
    "&Gamma;"    : "\u{0393}",
    "&Delta;"    : "\u{0394}",
    "&Epsilon;"  : "\u{0395}",
    "&Zeta;"     : "\u{0396}",
    "&Eta;"      : "\u{0397}",
    "&Theta;"    : "\u{0398}",
    "&Iota;"     : "\u{0399}",
    "&Kappa;"    : "\u{039A}",
    "&Lambda;"   : "\u{039B}",
    "&Mu;"       : "\u{039C}",
    "&Nu;"       : "\u{039D}",
    "&Xi;"       : "\u{039E}",
    "&Omicron;"  : "\u{039F}",
    "&Pi;"       : "\u{03A0}",
    "&Rho;"      : "\u{03A1}",
    "&Sigma;"    : "\u{03A3}",
    "&Tau;"      : "\u{03A4}",
    "&Upsilon;"  : "\u{03A5}",
    "&Phi;"      : "\u{03A6}",
    "&Chi;"      : "\u{03A7}",
    "&Psi;"      : "\u{03A8}",
    "&Omega;"    : "\u{03A9}",
    "&alpha;"    : "\u{03B1}",
    "&beta;"     : "\u{03B2}",
    "&gamma;"    : "\u{03B3}",
    "&delta;"    : "\u{03B4}",
    "&epsilon;"  : "\u{03B5}",
    "&zeta;"     : "\u{03B6}",
    "&eta;"      : "\u{03B7}",
    "&theta;"    : "\u{03B8}",
    "&iota;"     : "\u{03B9}",
    "&kappa;"    : "\u{03BA}",
    "&lambda;"   : "\u{03BB}",
    "&mu;"       : "\u{03BC}",
    "&nu;"       : "\u{03BD}",
    "&xi;"       : "\u{03BE}",
    "&omicron;"  : "\u{03BF}",
    "&pi;"       : "\u{03C0}",
    "&rho;"      : "\u{03C1}",
    "&sigmaf;"   : "\u{03C2}",
    "&sigma;"    : "\u{03C3}",
    "&tau;"      : "\u{03C4}",
    "&upsilon;"  : "\u{03C5}",
    "&phi;"      : "\u{03C6}",
    "&chi;"      : "\u{03C7}",
    "&psi;"      : "\u{03C8}",
    "&omega;"    : "\u{03C9}",
    "&thetasym;" : "\u{03D1}",
    "&upsih;"    : "\u{03D2}",
    "&piv;"      : "\u{03D6}",
    "&ensp;"     : "\u{2002}",
    "&emsp;"     : "\u{2003}",
    "&thinsp;"   : "\u{2009}",
    "&zwnj;"     : "\u{200C}",
    "&zwj;"      : "\u{200D}",
    "&lrm;"      : "\u{200E}",
    "&rlm;"      : "\u{200F}",
    "&ndash;"    : "\u{2013}",
    "&mdash;"    : "\u{2014}",
    "&lsquo;"    : "\u{2018}",
    "&rsquo;"    : "\u{2019}",
    "&sbquo;"    : "\u{201A}",
    "&ldquo;"    : "\u{201C}",
    "&rdquo;"    : "\u{201D}",
    "&bdquo;"    : "\u{201E}",
    "&dagger;"   : "\u{2020}",
    "&Dagger;"   : "\u{2021}",
    "&bull;"     : "\u{2022}",
    "&hellip;"   : "\u{2026}",
    "&permil;"   : "\u{2030}",
    "&prime;"    : "\u{2032}",
    "&Prime;"    : "\u{2033}",
    "&lsaquo;"   : "\u{2039}",
    "&rsaquo;"   : "\u{203A}",
    "&oline;"    : "\u{203E}",
    "&frasl;"    : "\u{2044}",
    "&euro;"     : "\u{20AC}",
    "&image;"    : "\u{2111}",
    "&weierp;"   : "\u{2118}",
    "&real;"     : "\u{211C}",
    "&trade;"    : "\u{2122}",
    "&alefsym;"  : "\u{2135}",
    "&larr;"     : "\u{2190}",
    "&uarr;"     : "\u{2191}",
    "&rarr;"     : "\u{2192}",
    "&darr;"     : "\u{2193}",
    "&harr;"     : "\u{2194}",
    "&crarr;"    : "\u{21B5}",
    "&lArr;"     : "\u{21D0}",
    "&uArr;"     : "\u{21D1}",
    "&rArr;"     : "\u{21D2}",
    "&dArr;"     : "\u{21D3}",
    "&hArr;"     : "\u{21D4}",
    "&forall;"   : "\u{2200}",
    "&part;"     : "\u{2202}",
    "&exist;"    : "\u{2203}",
    "&empty;"    : "\u{2205}",
    "&nabla;"    : "\u{2207}",
    "&isin;"     : "\u{2208}",
    "&notin;"    : "\u{2209}",
    "&ni;"       : "\u{220B}",
    "&prod;"     : "\u{220F}",
    "&sum;"      : "\u{2211}",
    "&minus;"    : "\u{2212}",
    "&lowast;"   : "\u{2217}",
    "&radic;"    : "\u{221A}",
    "&prop;"     : "\u{221D}",
    "&infin;"    : "\u{221E}",
    "&ang;"      : "\u{2220}",
    "&and;"      : "\u{2227}",
    "&or;"       : "\u{2228}",
    "&cap;"      : "\u{2229}",
    "&cup;"      : "\u{222A}",
    "&int;"      : "\u{222B}",
    "&there4;"   : "\u{2234}",
    "&sim;"      : "\u{223C}",
    "&cong;"     : "\u{2245}",
    "&asymp;"    : "\u{2248}",
    "&ne;"       : "\u{2260}",
    "&equiv;"    : "\u{2261}",
    "&le;"       : "\u{2264}",
    "&ge;"       : "\u{2265}",
    "&sub;"      : "\u{2282}",
    "&sup;"      : "\u{2283}",
    "&nsub;"     : "\u{2284}",
    "&sube;"     : "\u{2286}",
    "&supe;"     : "\u{2287}",
    "&oplus;"    : "\u{2295}",
    "&otimes;"   : "\u{2297}",
    "&perp;"     : "\u{22A5}",
    "&sdot;"     : "\u{22C5}",
    "&lceil;"    : "\u{2308}",
    "&rceil;"    : "\u{2309}",
    "&lfloor;"   : "\u{230A}",
    "&rfloor;"   : "\u{230B}",
    "&lang;"     : "\u{2329}",
    "&rang;"     : "\u{232A}",
    "&loz;"      : "\u{25CA}",
    "&spades;"   : "\u{2660}",
    "&clubs;"    : "\u{2663}",
    "&hearts;"   : "\u{2665}",
    "&diams;"    : "\u{2666}",
]
extension String {

    /// Returns a new string made by replacing in the `String`
    /// all HTML character entity references with the corresponding
    /// character.
    var stringByDecodingHTMLEntities: String {
        return decodeHTMLEntities().decodedString
    }

    /// Decodes every HTML character entity reference in the `String` and
    /// reports where each replacement happened.
    ///
    /// Named entities are looked up in `characterEntities`; decimal
    /// (`&#64;`) and hexadecimal (`&#x20ac;` / `&#X20AC;`) references are
    /// converted to the Unicode scalar they denote. Anything that does not
    /// decode is copied to the output unchanged.
    ///
    /// - Returns: A tuple of
    ///   - `decodedString`: the string with all decodable entities replaced.
    ///   - `replacementOffsets`: one element per replacement, in order of
    ///     appearance. `index` is the position in the *original* string just
    ///     past the entity's `;`. `offset` is the change in length, in
    ///     characters, caused by that replacement (`1 - entity length`, so
    ///     always <= 0). This lets callers adjust attribute ranges that were
    ///     computed against the original string.
    func decodeHTMLEntities() -> (decodedString: String, replacementOffsets: [(index: String.Index, offset: Int)]) {

        // ===== Utility functions =====

        // Record the index offsets of each replacement.
        // `offset` is an `Int`, which is what `String.Index.Distance`
        // aliased in the Swift version this was first written for.
        var replacementOffsets: [(index: String.Index, offset: Int)] = []

        // Convert the number in the string to the corresponding
        // Unicode character, e.g.
        //    decodeNumeric("64", base: 10)   --> "@"
        //    decodeNumeric("20ac", base: 16) --> "€"
        // Returns `nil` when `digits` is empty, contains anything other than
        // digits valid for `base`, overflows `UInt32`, or names a value that
        // is not a Unicode scalar (surrogates, values above U+10FFFF).
        func decodeNumeric(_ digits: Substring, base: Int) -> Character? {
            // `UInt32(_:radix:)` tolerates a leading sign; a character
            // reference must not have one, so reject it explicitly.
            guard let first = digits.first, first != "+", first != "-",
                  let code = UInt32(digits, radix: base),
                  let scalar = Unicode.Scalar(code) else {
                return nil
            }
            return Character(scalar)
        }

        // Decode the HTML character entity to the corresponding
        // Unicode character, return `nil` for invalid input.
        //     decode("&#64;")    --> "@"
        //     decode("&#x20ac;") --> "€"
        //     decode("&lt;")     --> "<"
        //     decode("&foo;")    --> nil
        // `entity` always starts with "&" and ends with ";".
        func decode(_ entity: Substring) -> Character? {
            if entity.hasPrefix("&#x") || entity.hasPrefix("&#X") {
                // Strip the "&#x" prefix and the trailing ";".
                return decodeNumeric(entity.dropFirst(3).dropLast(), base: 16)
            } else if entity.hasPrefix("&#") {
                // Strip the "&#" prefix and the trailing ";".
                return decodeNumeric(entity.dropFirst(2).dropLast(), base: 10)
            } else {
                return characterEntities[String(entity)]
            }
        }

        // ===== Method starts here =====

        var result = ""
        var position = startIndex

        // Find the next '&' and copy the characters preceding it to `result`:
        while let ampIndex = self[position...].firstIndex(of: "&") {
            result.append(contentsOf: self[position ..< ampIndex])
            position = ampIndex

            // Look for whichever comes first after this '&': a ';' that
            // closes the entity, or another '&' that starts a new candidate.
            let afterAmp = index(after: ampIndex)
            guard let delimiter = self[afterAmp...].firstIndex(where: { $0 == "&" || $0 == ";" }) else {
                // Neither found: nothing left to decode.
                break
            }

            if self[delimiter] == "&" {
                // The current '&' is not part of an entity. Copy it (and the
                // text up to the next '&') verbatim and retry from there, so
                // a stray ampersand cannot hide a valid entity behind it.
                result.append(contentsOf: self[ampIndex ..< delimiter])
                position = delimiter
                continue
            }

            // Everything from '&' through ';' is the entity candidate.
            let entityEnd = index(after: delimiter)
            let entity = self[ampIndex ..< entityEnd]

            if let decoded = decode(entity) {
                // Replace by decoded character:
                result.append(decoded)
                // Record offset: one character now stands where
                // `entity.count` characters used to be.
                replacementOffsets.append((index: entityEnd, offset: 1 - entity.count))
            } else {
                // Invalid entity, copy verbatim:
                result.append(contentsOf: entity)
            }
            position = entityEnd
        }

        // Copy remaining characters to `result`:
        result.append(contentsOf: self[position...])

        // Return results
        return (decodedString: result, replacementOffsets: replacementOffsets)
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Swift 5.7+
With the Regex Literal support introduced in Swift 5.7, this process has gotten much simpler:
Warning
I haven't really tested all the edge cases in the above snippet and I typed it on GitHub, so it may require tweaking. Still, it's way more streamlined than the older approach and the performance is phenomenal.