Last active
May 27, 2024 04:36
-
-
Save mwaterfall/25b4a6a06dc3309d9555 to your computer and use it in GitHub Desktop.
Decoding HTML Entities in Swift
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Very slightly adapted from http://stackoverflow.com/a/30141700/106244
// 99.99% Credit to Martin R!
// Mapping from XML/HTML character entity reference to character
// From http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
/// Lookup table from a complete named character reference to its character.
///
/// Keys are the full reference text, including the leading `&` and the
/// trailing `;` (e.g. `"&lt;"`), because that is the form the decoders in
/// this file use when they index the table. Keys are case-sensitive
/// (`&Prime;` and `&prime;` are different characters).
private let characterEntities : [String: Character] = [
    // XML predefined entities:
    "&quot;"     : "\"",
    "&amp;"      : "&",
    "&apos;"     : "'",
    "&lt;"       : "<",
    "&gt;"       : ">",

    // HTML character entity references:
    "&nbsp;"     : "\u{00A0}",
    "&iexcl;"    : "\u{00A1}",
    "&cent;"     : "\u{00A2}",
    "&pound;"    : "\u{00A3}",
    "&curren;"   : "\u{00A4}",
    "&yen;"      : "\u{00A5}",
    "&brvbar;"   : "\u{00A6}",
    "&sect;"     : "\u{00A7}",
    "&uml;"      : "\u{00A8}",
    "&copy;"     : "\u{00A9}",
    "&ordf;"     : "\u{00AA}",
    "&laquo;"    : "\u{00AB}",
    "&not;"      : "\u{00AC}",
    "&shy;"      : "\u{00AD}",
    "&reg;"      : "\u{00AE}",
    "&macr;"     : "\u{00AF}",
    "&deg;"      : "\u{00B0}",
    "&plusmn;"   : "\u{00B1}",
    "&sup2;"     : "\u{00B2}",
    "&sup3;"     : "\u{00B3}",
    "&acute;"    : "\u{00B4}",
    "&micro;"    : "\u{00B5}",
    "&para;"     : "\u{00B6}",
    "&middot;"   : "\u{00B7}",
    "&cedil;"    : "\u{00B8}",
    "&sup1;"     : "\u{00B9}",
    "&ordm;"     : "\u{00BA}",
    "&raquo;"    : "\u{00BB}",
    "&frac14;"   : "\u{00BC}",
    "&frac12;"   : "\u{00BD}",
    "&frac34;"   : "\u{00BE}",
    "&iquest;"   : "\u{00BF}",
    "&Agrave;"   : "\u{00C0}",
    "&Aacute;"   : "\u{00C1}",
    "&Acirc;"    : "\u{00C2}",
    "&Atilde;"   : "\u{00C3}",
    "&Auml;"     : "\u{00C4}",
    "&Aring;"    : "\u{00C5}",
    "&AElig;"    : "\u{00C6}",
    "&Ccedil;"   : "\u{00C7}",
    "&Egrave;"   : "\u{00C8}",
    "&Eacute;"   : "\u{00C9}",
    "&Ecirc;"    : "\u{00CA}",
    "&Euml;"     : "\u{00CB}",
    "&Igrave;"   : "\u{00CC}",
    "&Iacute;"   : "\u{00CD}",
    "&Icirc;"    : "\u{00CE}",
    "&Iuml;"     : "\u{00CF}",
    "&ETH;"      : "\u{00D0}",
    "&Ntilde;"   : "\u{00D1}",
    "&Ograve;"   : "\u{00D2}",
    "&Oacute;"   : "\u{00D3}",
    "&Ocirc;"    : "\u{00D4}",
    "&Otilde;"   : "\u{00D5}",
    "&Ouml;"     : "\u{00D6}",
    "&times;"    : "\u{00D7}",
    "&Oslash;"   : "\u{00D8}",
    "&Ugrave;"   : "\u{00D9}",
    "&Uacute;"   : "\u{00DA}",
    "&Ucirc;"    : "\u{00DB}",
    "&Uuml;"     : "\u{00DC}",
    "&Yacute;"   : "\u{00DD}",
    "&THORN;"    : "\u{00DE}",
    "&szlig;"    : "\u{00DF}",
    "&agrave;"   : "\u{00E0}",
    "&aacute;"   : "\u{00E1}",
    "&acirc;"    : "\u{00E2}",
    "&atilde;"   : "\u{00E3}",
    "&auml;"     : "\u{00E4}",
    "&aring;"    : "\u{00E5}",
    "&aelig;"    : "\u{00E6}",
    "&ccedil;"   : "\u{00E7}",
    "&egrave;"   : "\u{00E8}",
    "&eacute;"   : "\u{00E9}",
    "&ecirc;"    : "\u{00EA}",
    "&euml;"     : "\u{00EB}",
    "&igrave;"   : "\u{00EC}",
    "&iacute;"   : "\u{00ED}",
    "&icirc;"    : "\u{00EE}",
    "&iuml;"     : "\u{00EF}",
    "&eth;"      : "\u{00F0}",
    "&ntilde;"   : "\u{00F1}",
    "&ograve;"   : "\u{00F2}",
    "&oacute;"   : "\u{00F3}",
    "&ocirc;"    : "\u{00F4}",
    "&otilde;"   : "\u{00F5}",
    "&ouml;"     : "\u{00F6}",
    "&divide;"   : "\u{00F7}",
    "&oslash;"   : "\u{00F8}",
    "&ugrave;"   : "\u{00F9}",
    "&uacute;"   : "\u{00FA}",
    "&ucirc;"    : "\u{00FB}",
    "&uuml;"     : "\u{00FC}",
    "&yacute;"   : "\u{00FD}",
    "&thorn;"    : "\u{00FE}",
    "&yuml;"     : "\u{00FF}",
    "&OElig;"    : "\u{0152}",
    "&oelig;"    : "\u{0153}",
    "&Scaron;"   : "\u{0160}",
    "&scaron;"   : "\u{0161}",
    "&Yuml;"     : "\u{0178}",
    "&fnof;"     : "\u{0192}",
    "&circ;"     : "\u{02C6}",
    "&tilde;"    : "\u{02DC}",
    "&Alpha;"    : "\u{0391}",
    "&Beta;"     : "\u{0392}",
    "&Gamma;"    : "\u{0393}",
    "&Delta;"    : "\u{0394}",
    "&Epsilon;"  : "\u{0395}",
    "&Zeta;"     : "\u{0396}",
    "&Eta;"      : "\u{0397}",
    "&Theta;"    : "\u{0398}",
    "&Iota;"     : "\u{0399}",
    "&Kappa;"    : "\u{039A}",
    "&Lambda;"   : "\u{039B}",
    "&Mu;"       : "\u{039C}",
    "&Nu;"       : "\u{039D}",
    "&Xi;"       : "\u{039E}",
    "&Omicron;"  : "\u{039F}",
    "&Pi;"       : "\u{03A0}",
    "&Rho;"      : "\u{03A1}",
    "&Sigma;"    : "\u{03A3}",
    "&Tau;"      : "\u{03A4}",
    "&Upsilon;"  : "\u{03A5}",
    "&Phi;"      : "\u{03A6}",
    "&Chi;"      : "\u{03A7}",
    "&Psi;"      : "\u{03A8}",
    "&Omega;"    : "\u{03A9}",
    "&alpha;"    : "\u{03B1}",
    "&beta;"     : "\u{03B2}",
    "&gamma;"    : "\u{03B3}",
    "&delta;"    : "\u{03B4}",
    "&epsilon;"  : "\u{03B5}",
    "&zeta;"     : "\u{03B6}",
    "&eta;"      : "\u{03B7}",
    "&theta;"    : "\u{03B8}",
    "&iota;"     : "\u{03B9}",
    "&kappa;"    : "\u{03BA}",
    "&lambda;"   : "\u{03BB}",
    "&mu;"       : "\u{03BC}",
    "&nu;"       : "\u{03BD}",
    "&xi;"       : "\u{03BE}",
    "&omicron;"  : "\u{03BF}",
    "&pi;"       : "\u{03C0}",
    "&rho;"      : "\u{03C1}",
    "&sigmaf;"   : "\u{03C2}",
    "&sigma;"    : "\u{03C3}",
    "&tau;"      : "\u{03C4}",
    "&upsilon;"  : "\u{03C5}",
    "&phi;"      : "\u{03C6}",
    "&chi;"      : "\u{03C7}",
    "&psi;"      : "\u{03C8}",
    "&omega;"    : "\u{03C9}",
    "&thetasym;" : "\u{03D1}",
    "&upsih;"    : "\u{03D2}",
    "&piv;"      : "\u{03D6}",
    "&ensp;"     : "\u{2002}",
    "&emsp;"     : "\u{2003}",
    "&thinsp;"   : "\u{2009}",
    "&zwnj;"     : "\u{200C}",
    "&zwj;"      : "\u{200D}",
    "&lrm;"      : "\u{200E}",
    "&rlm;"      : "\u{200F}",
    "&ndash;"    : "\u{2013}",
    "&mdash;"    : "\u{2014}",
    "&lsquo;"    : "\u{2018}",
    "&rsquo;"    : "\u{2019}",
    "&sbquo;"    : "\u{201A}",
    "&ldquo;"    : "\u{201C}",
    "&rdquo;"    : "\u{201D}",
    "&bdquo;"    : "\u{201E}",
    "&dagger;"   : "\u{2020}",
    "&Dagger;"   : "\u{2021}",
    "&bull;"     : "\u{2022}",
    "&hellip;"   : "\u{2026}",
    "&permil;"   : "\u{2030}",
    "&prime;"    : "\u{2032}",
    "&Prime;"    : "\u{2033}",
    "&lsaquo;"   : "\u{2039}",
    "&rsaquo;"   : "\u{203A}",
    "&oline;"    : "\u{203E}",
    "&frasl;"    : "\u{2044}",
    "&euro;"     : "\u{20AC}",
    "&image;"    : "\u{2111}",
    "&weierp;"   : "\u{2118}",
    "&real;"     : "\u{211C}",
    "&trade;"    : "\u{2122}",
    "&alefsym;"  : "\u{2135}",
    "&larr;"     : "\u{2190}",
    "&uarr;"     : "\u{2191}",
    "&rarr;"     : "\u{2192}",
    "&darr;"     : "\u{2193}",
    "&harr;"     : "\u{2194}",
    "&crarr;"    : "\u{21B5}",
    "&lArr;"     : "\u{21D0}",
    "&uArr;"     : "\u{21D1}",
    "&rArr;"     : "\u{21D2}",
    "&dArr;"     : "\u{21D3}",
    "&hArr;"     : "\u{21D4}",
    "&forall;"   : "\u{2200}",
    "&part;"     : "\u{2202}",
    "&exist;"    : "\u{2203}",
    "&empty;"    : "\u{2205}",
    "&nabla;"    : "\u{2207}",
    "&isin;"     : "\u{2208}",
    "&notin;"    : "\u{2209}",
    "&ni;"       : "\u{220B}",
    "&prod;"     : "\u{220F}",
    "&sum;"      : "\u{2211}",
    "&minus;"    : "\u{2212}",
    "&lowast;"   : "\u{2217}",
    "&radic;"    : "\u{221A}",
    "&prop;"     : "\u{221D}",
    "&infin;"    : "\u{221E}",
    "&ang;"      : "\u{2220}",
    "&and;"      : "\u{2227}",
    "&or;"       : "\u{2228}",
    "&cap;"      : "\u{2229}",
    "&cup;"      : "\u{222A}",
    "&int;"      : "\u{222B}",
    "&there4;"   : "\u{2234}",
    "&sim;"      : "\u{223C}",
    "&cong;"     : "\u{2245}",
    "&asymp;"    : "\u{2248}",
    "&ne;"       : "\u{2260}",
    "&equiv;"    : "\u{2261}",
    "&le;"       : "\u{2264}",
    "&ge;"       : "\u{2265}",
    "&sub;"      : "\u{2282}",
    "&sup;"      : "\u{2283}",
    "&nsub;"     : "\u{2284}",
    "&sube;"     : "\u{2286}",
    "&supe;"     : "\u{2287}",
    "&oplus;"    : "\u{2295}",
    "&otimes;"   : "\u{2297}",
    "&perp;"     : "\u{22A5}",
    "&sdot;"     : "\u{22C5}",
    "&lceil;"    : "\u{2308}",
    "&rceil;"    : "\u{2309}",
    "&lfloor;"   : "\u{230A}",
    "&rfloor;"   : "\u{230B}",
    "&lang;"     : "\u{2329}",
    "&rang;"     : "\u{232A}",
    "&loz;"      : "\u{25CA}",
    "&spades;"   : "\u{2660}",
    "&clubs;"    : "\u{2663}",
    "&hearts;"   : "\u{2665}",
    "&diams;"    : "\u{2666}",
]
extension String {
    /// Returns a new string made by replacing in the `String`
    /// all HTML character entity references with the corresponding
    /// character.
    var stringByDecodingHTMLEntities: String {
        return decodeHTMLEntities().decodedString
    }

    /// Decodes every HTML character entity reference in the string and
    /// reports where each replacement happened.
    ///
    /// Named references are looked up in `characterEntities`; decimal
    /// (`&#64;`) and hexadecimal (`&#x20ac;`) references are converted
    /// numerically. A reference that is unknown, malformed, or names an
    /// invalid code point is copied to the output unchanged.
    ///
    /// - Returns: A tuple of
    ///   - `decodedString`: the decoded text, and
    ///   - `replacementOffsets`: one element per replacement, in order of
    ///     appearance. `index` is the position in the *original* string just
    ///     past the reference's `;`, and `offset` is the change in character
    ///     count caused by the replacement (`1 - referenceLength`). This lets
    ///     callers shift attribute ranges that were computed against the
    ///     original string.
    func decodeHTMLEntities() -> (decodedString: String, replacementOffsets: [(index: String.Index, offset: Int)]) {

        // ===== Utility functions =====

        // Convert the digits to the corresponding Unicode character, e.g.
        //     decodeNumeric("64", base: 10)   --> "@"
        //     decodeNumeric("20ac", base: 16) --> "€"
        // Returns `nil` for non-numeric text and for values that are not
        // valid Unicode scalars (surrogates, > U+10FFFF) instead of trapping.
        func decodeNumeric(_ digits: Substring, base: Int) -> Character? {
            guard let code = UInt32(digits, radix: base),
                  let scalar = UnicodeScalar(code) else { return nil }
            return Character(scalar)
        }

        // Decode one complete reference (including `&` and `;`), e.g.
        //     decode("&#64;")    --> "@"
        //     decode("&#x20ac;") --> "€"
        //     decode("&lt;")     --> "<"
        //     decode("&foo;")    --> nil
        func decode(_ entity: Substring) -> Character? {
            let name = entity.dropFirst().dropLast() // strip '&' and ';'
            if name.hasPrefix("#x") || name.hasPrefix("#X") {
                return decodeNumeric(name.dropFirst(2), base: 16)
            } else if name.hasPrefix("#") {
                return decodeNumeric(name.dropFirst(), base: 10)
            } else {
                return characterEntities[String(entity)]
            }
        }

        // ===== Method starts here =====

        var replacementOffsets: [(index: String.Index, offset: Int)] = []
        var result = ""
        var position = startIndex

        // Find the next '&' and copy the characters preceding it to `result`:
        while let ampIndex = self[position...].firstIndex(of: "&") {
            result.append(contentsOf: self[position ..< ampIndex])
            position = ampIndex

            // A reference ends at the next ';'. If another '&' shows up first,
            // the current '&' is a stray one: copy it verbatim and rescan from
            // the later '&' so that a following reference is still decoded.
            let searchStart = index(after: ampIndex)
            guard let stopIndex = self[searchStart...].firstIndex(where: { $0 == ";" || $0 == "&" }) else {
                // No terminator at all: nothing left to decode.
                break
            }
            if self[stopIndex] == "&" {
                result.append(contentsOf: self[ampIndex ..< stopIndex])
                position = stopIndex
                continue
            }

            let entityEnd = index(after: stopIndex)
            let entity = self[ampIndex ..< entityEnd]
            if let decoded = decode(entity) {
                // Replace by decoded character and record the length change.
                result.append(decoded)
                let delta = 1 - distance(from: ampIndex, to: entityEnd)
                replacementOffsets.append((index: entityEnd, offset: delta))
            } else {
                // Invalid entity, copy verbatim:
                result.append(contentsOf: entity)
            }
            position = entityEnd
        }

        // Copy remaining characters to `result`:
        result.append(contentsOf: self[position...])
        return (decodedString: result, replacementOffsets: replacementOffsets)
    }
}
extension String {
    /// Returns a copy of the string with a small, fixed set of HTML/XML
    /// entity references (`&quot;`, `&apos;`, `&lt;`, `&gt;`, `&deg;`,
    /// `&amp;`) replaced by the characters they stand for.
    ///
    /// Replacements run in a fixed order with `&amp;` last, so text that was
    /// escaped twice is unescaped exactly once (`"&amp;lt;"` becomes
    /// `"&lt;"`, not `"<"`). Any other reference is left untouched.
    func htmlDecoded() -> String {
        guard !isEmpty else { return self }

        // from https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
        // An ordered list rather than a dictionary: dictionary iteration order
        // is unspecified, and decoding `&amp;` before the others would let its
        // output be decoded a second time.
        let entities: [(name: String, value: String)] = [
            ("&quot;", "\""),
            ("&apos;", "'"),
            ("&lt;", "<"),
            ("&gt;", ">"),
            ("&deg;", "\u{00B0}"), // DEGREE SIGN, not U+00BA (ordinal indicator)
            ("&amp;", "&"),
        ]

        return entities.reduce(self) { partial, entity in
            partial.replacingOccurrences(of: entity.name, with: entity.value)
        }
    }
}
// Demo: the forecast text carries `&deg;` references, which `htmlDecoded()`
// turns into degree signs. The encoded and decoded forms are both printed.
var input = "Mostly dry. Warm (max 28&deg;C on Thu morning, min 13&deg;C on Wed night). Wind will be generally light."
print(input)
var output = input.htmlDecoded()
print(output)
Some manual work is needed to extend the above for Swift 3.
@marbetschar any links on how to use the example you linked to?
Swift 4 anyone? https://gist.github.com/nathanfjohnson/380b9f24c991a8970144e13ddd044d21 Many changes and updates to the original approach.
For Swift 3:
// NOTE(review): this variant is labelled "for swift 3" in the thread; it
// passes string slices straight to `decode(_:)` / `append(_:)` — confirm the
// toolchain before reusing it as-is.
extension String {
/// Returns a new string made by replacing in the `String`
/// all HTML character entity references with the corresponding
/// character.
///
/// Named references are looked up in `characterEntities`; `&#…;` and
/// `&#x…;` references are converted numerically. Anything that cannot be
/// decoded is copied to the result unchanged.
var stringByDecodingHTMLEntities : String {
// ===== Utility functions =====
// Convert the number in the string to the corresponding
// Unicode character, e.g.
// decodeNumeric("64", base: 10) --> "@"
// decodeNumeric("20ac", base: 16) --> "€"
// Returns `nil` when the text is not a number in `base` or the value
// is not a valid Unicode scalar.
func decodeNumeric(_ string : String, base : Int) -> Character? {
guard let code = UInt32(string, radix: base),
let uniScalar = UnicodeScalar(code) else { return nil }
return Character(uniScalar)
}
// Decode the HTML character entity to the corresponding
// Unicode character, return `nil` for invalid input.
// decode("&#64;") --> "@"
// decode("&#x20ac;") --> "€"
// decode("&lt;") --> "<"
// decode("&foo;") --> nil
// `entity` is the whole reference, from '&' through ';'; the numeric
// branches slice off the prefix and the trailing ';' before parsing.
func decode(_ entity : String) -> Character? {
if entity.hasPrefix("&#x") || entity.hasPrefix("&#X"){
return decodeNumeric(entity.substring(with: entity.index(entity.startIndex, offsetBy: 3) ..< entity.index(entity.endIndex, offsetBy: -1)), base: 16)
} else if entity.hasPrefix("&#") {
return decodeNumeric(entity.substring(with: entity.index(entity.startIndex, offsetBy: 2) ..< entity.index(entity.endIndex, offsetBy: -1)), base: 10)
} else {
return characterEntities[entity]
}
}
// ===== Method starts here =====
var result = ""
var position = startIndex
// Find the next '&' and copy the characters preceding it to `result`:
while let ampRange = self.range(of: "&", range: position ..< endIndex) {
result.append(self[position ..< ampRange.lowerBound])
position = ampRange.lowerBound
// Find the next ';' and copy everything from '&' to ';' into `entity`
// NOTE(review): the search runs to the first ';' regardless of what lies
// in between, so for "a & b &lt; c" the candidate is "& b &lt;"; it is
// copied verbatim and the embedded "&lt;" is never decoded.
if let semiRange = self.range(of: ";", range: position ..< endIndex) {
let entity = self[position ..< semiRange.upperBound]
position = semiRange.upperBound
if let decoded = decode(entity) {
// Replace by decoded character:
result.append(decoded)
} else {
// Invalid entity, copy verbatim:
result.append(entity)
}
} else {
// No matching ';'.
break
}
}
// Copy remaining characters to `result`:
result.append(self[position ..< endIndex])
return result
}
}
Swift 5+
extension String {
    /// Returns a new string made by replacing in the `String`
    /// all HTML character entity references with the corresponding
    /// character.
    ///
    /// Named references are looked up in `characterEntities`; decimal
    /// (`&#64;`) and hexadecimal (`&#x20ac;`) references are converted
    /// numerically. A reference that is unknown, malformed, or names an
    /// invalid code point is copied to the result unchanged, as is a bare
    /// `&` that does not start a reference.
    var stringByDecodingHTMLEntities : String {

        // ===== Utility functions =====

        // Convert the digits to the corresponding Unicode character, e.g.
        //     decodeNumeric("64", base: 10)   --> "@"
        //     decodeNumeric("20ac", base: 16) --> "€"
        // Returns `nil` for non-numeric text and for invalid scalar values.
        func decodeNumeric(_ digits : Substring, base : Int) -> Character? {
            guard let code = UInt32(digits, radix: base),
                  let uniScalar = UnicodeScalar(code) else { return nil }
            return Character(uniScalar)
        }

        // Decode one complete reference (including `&` and `;`) to the
        // corresponding Unicode character, return `nil` for invalid input.
        //     decode("&#64;")    --> "@"
        //     decode("&#x20ac;") --> "€"
        //     decode("&lt;")     --> "<"
        //     decode("&foo;")    --> nil
        func decode(_ entity : Substring) -> Character? {
            let name = entity.dropFirst().dropLast() // strip '&' and ';'
            if name.hasPrefix("#x") || name.hasPrefix("#X") {
                return decodeNumeric(name.dropFirst(2), base: 16)
            } else if name.hasPrefix("#") {
                return decodeNumeric(name.dropFirst(), base: 10)
            } else {
                return characterEntities[String(entity)]
            }
        }

        // ===== Method starts here =====

        var result = ""
        var position = startIndex

        // Find the next '&' and copy the characters preceding it to `result`:
        while let ampIndex = self[position...].firstIndex(of: "&") {
            result.append(contentsOf: self[position ..< ampIndex])
            position = ampIndex

            // A reference ends at the next ';'. If another '&' appears first,
            // the current '&' is a stray one: copy it verbatim and rescan from
            // the later '&', otherwise "a & b &lt; c" would treat "& b &lt;"
            // as one (invalid) reference and never decode the "&lt;".
            let searchStart = index(after: ampIndex)
            guard let stopIndex = self[searchStart...].firstIndex(where: { $0 == ";" || $0 == "&" }) else {
                // No terminator at all: nothing left to decode.
                break
            }
            if self[stopIndex] == "&" {
                result.append(contentsOf: self[ampIndex ..< stopIndex])
                position = stopIndex
                continue
            }

            let entityEnd = index(after: stopIndex)
            let entity = self[ampIndex ..< entityEnd]
            if let decoded = decode(entity) {
                // Replace by decoded character:
                result.append(decoded)
            } else {
                // Invalid entity, copy verbatim:
                result.append(contentsOf: entity)
            }
            position = entityEnd
        }

        // Copy remaining characters to `result`:
        result.append(contentsOf: self[position...])
        return result
    }
}
Swift 5.7+
With the Regex Literal support introduced in Swift 5.7, this process has gotten much simpler:
// Sample input containing two named entity references.
let test: String = "Insisting on &gt; splitting on the first &lt; date"
var cleanText: String = ""

// Matches a named HTML entity reference, such as `&lt;` or `&nbsp;`.
// Numeric references (`&#64;`) are not matched by this pattern.
let regex = /&[A-Za-z0-9]{2,};/

// Start of the text that has not been copied into `cleanText` yet.
var lastIndex: String.Index = test.startIndex
let ranges: [Range<String.Index>] = test.ranges(of: regex)

for range in ranges {
    // Copy the literal text that sits before this candidate reference.
    if range.lowerBound > lastIndex {
        cleanText.append(String(test[lastIndex ..< range.lowerBound]))
    }

    let candidate = String(test[range])
    if let decoded = characterEntities[candidate] {
        // Known reference: emit its character and skip past it.
        cleanText.append(decoded)
        lastIndex = range.upperBound
    } else {
        // Unknown reference: leave it in place; it is copied verbatim with
        // the next stretch of literal text.
        lastIndex = range.lowerBound
    }
}

// Copy whatever follows the last match.
if lastIndex < test.endIndex {
    cleanText.append(String(test[lastIndex ..< test.endIndex]))
}
Warning
I haven't really tested all the edge cases in the above snippet and I typed it on GitHub, so it may require tweaking. Still, it's way more streamlined than the older approach and the performance is phenomenal.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@augustorsouza: had the same issue. Here for future reference: https://gitlab.com/snippets/32429