Created
October 12, 2017 10:14
-
-
Save akerenyi/659097f78c56cb82ad16af73e2c23773 to your computer and use it in GitHub Desktop.
Swift 4 version of Job van der Voort's "Decode HTML entities in swift" (https://gitlab.com/snippets/32429)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// Lookup table from HTML 4 named character references to the character each
/// one denotes.
///
/// Every key is the complete reference as it appears in markup, including the
/// leading `&` and the trailing `;` (for example `"&amp;"`), and keys are
/// case-sensitive (`"&dagger;"` and `"&Dagger;"` are different entries).
private let characterEntities: [String: Character] = [
    // XML predefined entities:
    "&quot;": "\"",
    "&amp;": "&",
    "&apos;": "'",
    "&lt;": "<",
    "&gt;": ">",

    // HTML character entity references:
    "&nbsp;": "\u{00A0}",
    "&iexcl;": "\u{00A1}",
    "&cent;": "\u{00A2}",
    "&pound;": "\u{00A3}",
    "&curren;": "\u{00A4}",
    "&yen;": "\u{00A5}",
    "&brvbar;": "\u{00A6}",
    "&sect;": "\u{00A7}",
    "&uml;": "\u{00A8}",
    "&copy;": "\u{00A9}",
    "&ordf;": "\u{00AA}",
    "&laquo;": "\u{00AB}",
    "&not;": "\u{00AC}",
    "&shy;": "\u{00AD}",
    "&reg;": "\u{00AE}",
    "&macr;": "\u{00AF}",
    "&deg;": "\u{00B0}",
    "&plusmn;": "\u{00B1}",
    "&sup2;": "\u{00B2}",
    "&sup3;": "\u{00B3}",
    "&acute;": "\u{00B4}",
    "&micro;": "\u{00B5}",
    "&para;": "\u{00B6}",
    "&middot;": "\u{00B7}",
    "&cedil;": "\u{00B8}",
    "&sup1;": "\u{00B9}",
    "&ordm;": "\u{00BA}",
    "&raquo;": "\u{00BB}",
    "&frac14;": "\u{00BC}",
    "&frac12;": "\u{00BD}",
    "&frac34;": "\u{00BE}",
    "&iquest;": "\u{00BF}",
    "&Agrave;": "\u{00C0}",
    "&Aacute;": "\u{00C1}",
    "&Acirc;": "\u{00C2}",
    "&Atilde;": "\u{00C3}",
    "&Auml;": "\u{00C4}",
    "&Aring;": "\u{00C5}",
    "&AElig;": "\u{00C6}",
    "&Ccedil;": "\u{00C7}",
    "&Egrave;": "\u{00C8}",
    "&Eacute;": "\u{00C9}",
    "&Ecirc;": "\u{00CA}",
    "&Euml;": "\u{00CB}",
    "&Igrave;": "\u{00CC}",
    "&Iacute;": "\u{00CD}",
    "&Icirc;": "\u{00CE}",
    "&Iuml;": "\u{00CF}",
    "&ETH;": "\u{00D0}",
    "&Ntilde;": "\u{00D1}",
    "&Ograve;": "\u{00D2}",
    "&Oacute;": "\u{00D3}",
    "&Ocirc;": "\u{00D4}",
    "&Otilde;": "\u{00D5}",
    "&Ouml;": "\u{00D6}",
    "&times;": "\u{00D7}",
    "&Oslash;": "\u{00D8}",
    "&Ugrave;": "\u{00D9}",
    "&Uacute;": "\u{00DA}",
    "&Ucirc;": "\u{00DB}",
    "&Uuml;": "\u{00DC}",
    "&Yacute;": "\u{00DD}",
    "&THORN;": "\u{00DE}",
    "&szlig;": "\u{00DF}",
    "&agrave;": "\u{00E0}",
    "&aacute;": "\u{00E1}",
    "&acirc;": "\u{00E2}",
    "&atilde;": "\u{00E3}",
    "&auml;": "\u{00E4}",
    "&aring;": "\u{00E5}",
    "&aelig;": "\u{00E6}",
    "&ccedil;": "\u{00E7}",
    "&egrave;": "\u{00E8}",
    "&eacute;": "\u{00E9}",
    "&ecirc;": "\u{00EA}",
    "&euml;": "\u{00EB}",
    "&igrave;": "\u{00EC}",
    "&iacute;": "\u{00ED}",
    "&icirc;": "\u{00EE}",
    "&iuml;": "\u{00EF}",
    "&eth;": "\u{00F0}",
    "&ntilde;": "\u{00F1}",
    "&ograve;": "\u{00F2}",
    "&oacute;": "\u{00F3}",
    "&ocirc;": "\u{00F4}",
    "&otilde;": "\u{00F5}",
    "&ouml;": "\u{00F6}",
    "&divide;": "\u{00F7}",
    "&oslash;": "\u{00F8}",
    "&ugrave;": "\u{00F9}",
    "&uacute;": "\u{00FA}",
    "&ucirc;": "\u{00FB}",
    "&uuml;": "\u{00FC}",
    "&yacute;": "\u{00FD}",
    "&thorn;": "\u{00FE}",
    "&yuml;": "\u{00FF}",
    "&OElig;": "\u{0152}",
    "&oelig;": "\u{0153}",
    "&Scaron;": "\u{0160}",
    "&scaron;": "\u{0161}",
    "&Yuml;": "\u{0178}",
    "&fnof;": "\u{0192}",
    "&circ;": "\u{02C6}",
    "&tilde;": "\u{02DC}",
    "&Alpha;": "\u{0391}",
    "&Beta;": "\u{0392}",
    "&Gamma;": "\u{0393}",
    "&Delta;": "\u{0394}",
    "&Epsilon;": "\u{0395}",
    "&Zeta;": "\u{0396}",
    "&Eta;": "\u{0397}",
    "&Theta;": "\u{0398}",
    "&Iota;": "\u{0399}",
    "&Kappa;": "\u{039A}",
    "&Lambda;": "\u{039B}",
    "&Mu;": "\u{039C}",
    "&Nu;": "\u{039D}",
    "&Xi;": "\u{039E}",
    "&Omicron;": "\u{039F}",
    "&Pi;": "\u{03A0}",
    "&Rho;": "\u{03A1}",
    "&Sigma;": "\u{03A3}",
    "&Tau;": "\u{03A4}",
    "&Upsilon;": "\u{03A5}",
    "&Phi;": "\u{03A6}",
    "&Chi;": "\u{03A7}",
    "&Psi;": "\u{03A8}",
    "&Omega;": "\u{03A9}",
    "&alpha;": "\u{03B1}",
    "&beta;": "\u{03B2}",
    "&gamma;": "\u{03B3}",
    "&delta;": "\u{03B4}",
    "&epsilon;": "\u{03B5}",
    "&zeta;": "\u{03B6}",
    "&eta;": "\u{03B7}",
    "&theta;": "\u{03B8}",
    "&iota;": "\u{03B9}",
    "&kappa;": "\u{03BA}",
    "&lambda;": "\u{03BB}",
    "&mu;": "\u{03BC}",
    "&nu;": "\u{03BD}",
    "&xi;": "\u{03BE}",
    "&omicron;": "\u{03BF}",
    "&pi;": "\u{03C0}",
    "&rho;": "\u{03C1}",
    "&sigmaf;": "\u{03C2}",
    "&sigma;": "\u{03C3}",
    "&tau;": "\u{03C4}",
    "&upsilon;": "\u{03C5}",
    "&phi;": "\u{03C6}",
    "&chi;": "\u{03C7}",
    "&psi;": "\u{03C8}",
    "&omega;": "\u{03C9}",
    "&thetasym;": "\u{03D1}",
    "&upsih;": "\u{03D2}",
    "&piv;": "\u{03D6}",
    "&ensp;": "\u{2002}",
    "&emsp;": "\u{2003}",
    "&thinsp;": "\u{2009}",
    "&zwnj;": "\u{200C}",
    "&zwj;": "\u{200D}",
    "&lrm;": "\u{200E}",
    "&rlm;": "\u{200F}",
    "&ndash;": "\u{2013}",
    "&mdash;": "\u{2014}",
    "&lsquo;": "\u{2018}",
    "&rsquo;": "\u{2019}",
    "&sbquo;": "\u{201A}",
    "&ldquo;": "\u{201C}",
    "&rdquo;": "\u{201D}",
    "&bdquo;": "\u{201E}",
    "&dagger;": "\u{2020}",
    "&Dagger;": "\u{2021}",
    "&bull;": "\u{2022}",
    "&hellip;": "\u{2026}",
    "&permil;": "\u{2030}",
    "&prime;": "\u{2032}",
    "&Prime;": "\u{2033}",
    "&lsaquo;": "\u{2039}",
    "&rsaquo;": "\u{203A}",
    "&oline;": "\u{203E}",
    "&frasl;": "\u{2044}",
    "&euro;": "\u{20AC}",
    "&image;": "\u{2111}",
    "&weierp;": "\u{2118}",
    "&real;": "\u{211C}",
    "&trade;": "\u{2122}",
    "&alefsym;": "\u{2135}",
    "&larr;": "\u{2190}",
    "&uarr;": "\u{2191}",
    "&rarr;": "\u{2192}",
    "&darr;": "\u{2193}",
    "&harr;": "\u{2194}",
    "&crarr;": "\u{21B5}",
    "&lArr;": "\u{21D0}",
    "&uArr;": "\u{21D1}",
    "&rArr;": "\u{21D2}",
    "&dArr;": "\u{21D3}",
    "&hArr;": "\u{21D4}",
    "&forall;": "\u{2200}",
    "&part;": "\u{2202}",
    "&exist;": "\u{2203}",
    "&empty;": "\u{2205}",
    "&nabla;": "\u{2207}",
    "&isin;": "\u{2208}",
    "&notin;": "\u{2209}",
    "&ni;": "\u{220B}",
    "&prod;": "\u{220F}",
    "&sum;": "\u{2211}",
    "&minus;": "\u{2212}",
    "&lowast;": "\u{2217}",
    "&radic;": "\u{221A}",
    "&prop;": "\u{221D}",
    "&infin;": "\u{221E}",
    "&ang;": "\u{2220}",
    "&and;": "\u{2227}",
    "&or;": "\u{2228}",
    "&cap;": "\u{2229}",
    "&cup;": "\u{222A}",
    "&int;": "\u{222B}",
    "&there4;": "\u{2234}",
    "&sim;": "\u{223C}",
    "&cong;": "\u{2245}",
    "&asymp;": "\u{2248}",
    "&ne;": "\u{2260}",
    "&equiv;": "\u{2261}",
    "&le;": "\u{2264}",
    "&ge;": "\u{2265}",
    "&sub;": "\u{2282}",
    "&sup;": "\u{2283}",
    "&nsub;": "\u{2284}",
    "&sube;": "\u{2286}",
    "&supe;": "\u{2287}",
    "&oplus;": "\u{2295}",
    "&otimes;": "\u{2297}",
    "&perp;": "\u{22A5}",
    "&sdot;": "\u{22C5}",
    "&lceil;": "\u{2308}",
    "&rceil;": "\u{2309}",
    "&lfloor;": "\u{230A}",
    "&rfloor;": "\u{230B}",
    "&lang;": "\u{2329}",
    "&rang;": "\u{232A}",
    "&loz;": "\u{25CA}",
    "&spades;": "\u{2660}",
    "&clubs;": "\u{2663}",
    "&hearts;": "\u{2665}",
    "&diams;": "\u{2666}",
]
extension String {
    /// Returns a new string made by replacing in the `String`
    /// all HTML character entity references with the corresponding
    /// character.
    ///
    /// Three kinds of reference are recognised, each terminated by `;`:
    /// decimal (`&#64;`), hexadecimal (`&#x20ac;` / `&#X20AC;`) and named
    /// references found in `characterEntities` (`&lt;`).
    /// Anything that looks like a reference but cannot be decoded is copied
    /// to the result unchanged.
    var stringByDecodingHTMLEntities: String {

        // ===== Utility functions =====

        // Convert the number in the string to the corresponding
        // Unicode character, e.g.
        //    decodeNumeric("64", 10)   --> "@"
        //    decodeNumeric("20ac", 16) --> "€"
        // Returns `nil` when `string` is not a number in the given base or
        // the number is not a valid Unicode scalar value.
        func decodeNumeric(_ string: String, base: Int) -> Character? {
            guard let code = UInt32(string, radix: base),
                  let uniScalar = UnicodeScalar(code) else { return nil }
            return Character(uniScalar)
        }

        // Decode the HTML character entity to the corresponding
        // Unicode character, return `nil` for invalid input.
        //     decode("&#64;")    --> "@"
        //     decode("&#x20ac;") --> "€"
        //     decode("&lt;")     --> "<"
        //     decode("&foo;")    --> nil
        // `entity` always starts with "&" and ends with ";".
        func decode(_ entity: String) -> Character? {
            if entity.hasPrefix("&#x") || entity.hasPrefix("&#X") {
                // Strip the "&#x" prefix and the trailing ";".
                return decodeNumeric(String(entity.dropFirst(3).dropLast()), base: 16)
            } else if entity.hasPrefix("&#") {
                // Strip the "&#" prefix and the trailing ";".
                return decodeNumeric(String(entity.dropFirst(2).dropLast()), base: 10)
            } else {
                return characterEntities[entity]
            }
        }

        // ===== Method starts here =====

        var result = ""
        var position = startIndex

        // Find the next '&' at or after `position`.
        while let ampRange = range(of: "&", range: position ..< endIndex) {
            // Find the ';' that could terminate an entity. Without one there
            // is nothing left to decode.
            guard let semiRange = range(of: ";", range: ampRange.upperBound ..< endIndex) else {
                break
            }

            // An entity never contains '&', so the only candidate ending at
            // this ';' starts at the LAST '&' before it. Starting at the first
            // one would let a bare ampersand ("a & b &amp; c") swallow the
            // real entity that follows it.
            let entityStart = range(of: "&",
                                    options: .backwards,
                                    range: ampRange.lowerBound ..< semiRange.lowerBound)?.lowerBound
                ?? ampRange.lowerBound

            // Copy everything preceding the candidate verbatim.
            result.append(contentsOf: self[position ..< entityStart])

            let entity = String(self[entityStart ..< semiRange.upperBound])
            if let decoded = decode(entity) {
                // Replace by decoded character:
                result.append(decoded)
            } else {
                // Invalid entity, copy verbatim:
                result.append(entity)
            }
            position = semiRange.upperBound
        }

        // Copy remaining characters to `result`:
        result.append(contentsOf: self[position ..< endIndex])
        return result
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment