Last active
March 19, 2024 15:24
-
-
Save jbadger3/7b386e7f284a96d8c8588fcbb459ddb5 to your computer and use it in GitHub Desktop.
Gist for the article 'Parsing Character Entities from HTML/XML Content In Swift' on Medium.com
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// Lookup table from a complete named character entity (including the leading `&`
/// and trailing `;`) to the single character it represents.
///
/// Keys are case-sensitive: `&Dagger;` and `&dagger;`, `&Prime;` and `&prime;`, or
/// `&lArr;` and `&larr;` are distinct entities. The table covers the five XML
/// predefined entities plus the HTML 4 character entity references.
private let characterEntities: [String: Character] = [
    // XML predefined entities:
    "&quot;"     : "\"",
    "&amp;"      : "&",
    "&apos;"     : "'",
    "&lt;"       : "<",
    "&gt;"       : ">",

    // HTML character entity references:
    "&nbsp;"     : "\u{00A0}",
    "&iexcl;"    : "\u{00A1}",
    "&cent;"     : "\u{00A2}",
    "&pound;"    : "\u{00A3}",
    "&curren;"   : "\u{00A4}",
    "&yen;"      : "\u{00A5}",
    "&brvbar;"   : "\u{00A6}",
    "&sect;"     : "\u{00A7}",
    "&uml;"      : "\u{00A8}",
    "&copy;"     : "\u{00A9}",
    "&ordf;"     : "\u{00AA}",
    "&laquo;"    : "\u{00AB}",
    "&not;"      : "\u{00AC}",
    "&shy;"      : "\u{00AD}",
    "&reg;"      : "\u{00AE}",
    "&macr;"     : "\u{00AF}",
    "&deg;"      : "\u{00B0}",
    "&plusmn;"   : "\u{00B1}",
    "&sup2;"     : "\u{00B2}",
    "&sup3;"     : "\u{00B3}",
    "&acute;"    : "\u{00B4}",
    "&micro;"    : "\u{00B5}",
    "&para;"     : "\u{00B6}",
    "&middot;"   : "\u{00B7}",
    "&cedil;"    : "\u{00B8}",
    "&sup1;"     : "\u{00B9}",
    "&ordm;"     : "\u{00BA}",
    "&raquo;"    : "\u{00BB}",
    "&frac14;"   : "\u{00BC}",
    "&frac12;"   : "\u{00BD}",
    "&frac34;"   : "\u{00BE}",
    "&iquest;"   : "\u{00BF}",
    "&Agrave;"   : "\u{00C0}",
    "&Aacute;"   : "\u{00C1}",
    "&Acirc;"    : "\u{00C2}",
    "&Atilde;"   : "\u{00C3}",
    "&Auml;"     : "\u{00C4}",
    "&Aring;"    : "\u{00C5}",
    "&AElig;"    : "\u{00C6}",
    "&Ccedil;"   : "\u{00C7}",
    "&Egrave;"   : "\u{00C8}",
    "&Eacute;"   : "\u{00C9}",
    "&Ecirc;"    : "\u{00CA}",
    "&Euml;"     : "\u{00CB}",
    "&Igrave;"   : "\u{00CC}",
    "&Iacute;"   : "\u{00CD}",
    "&Icirc;"    : "\u{00CE}",
    "&Iuml;"     : "\u{00CF}",
    "&ETH;"      : "\u{00D0}",
    "&Ntilde;"   : "\u{00D1}",
    "&Ograve;"   : "\u{00D2}",
    "&Oacute;"   : "\u{00D3}",
    "&Ocirc;"    : "\u{00D4}",
    "&Otilde;"   : "\u{00D5}",
    "&Ouml;"     : "\u{00D6}",
    "&times;"    : "\u{00D7}",
    "&Oslash;"   : "\u{00D8}",
    "&Ugrave;"   : "\u{00D9}",
    "&Uacute;"   : "\u{00DA}",
    "&Ucirc;"    : "\u{00DB}",
    "&Uuml;"     : "\u{00DC}",
    "&Yacute;"   : "\u{00DD}",
    "&THORN;"    : "\u{00DE}",
    "&szlig;"    : "\u{00DF}",
    "&agrave;"   : "\u{00E0}",
    "&aacute;"   : "\u{00E1}",
    "&acirc;"    : "\u{00E2}",
    "&atilde;"   : "\u{00E3}",
    "&auml;"     : "\u{00E4}",
    "&aring;"    : "\u{00E5}",
    "&aelig;"    : "\u{00E6}",
    "&ccedil;"   : "\u{00E7}",
    "&egrave;"   : "\u{00E8}",
    "&eacute;"   : "\u{00E9}",
    "&ecirc;"    : "\u{00EA}",
    "&euml;"     : "\u{00EB}",
    "&igrave;"   : "\u{00EC}",
    "&iacute;"   : "\u{00ED}",
    "&icirc;"    : "\u{00EE}",
    "&iuml;"     : "\u{00EF}",
    "&eth;"      : "\u{00F0}",
    "&ntilde;"   : "\u{00F1}",
    "&ograve;"   : "\u{00F2}",
    "&oacute;"   : "\u{00F3}",
    "&ocirc;"    : "\u{00F4}",
    "&otilde;"   : "\u{00F5}",
    "&ouml;"     : "\u{00F6}",
    "&divide;"   : "\u{00F7}",
    "&oslash;"   : "\u{00F8}",
    "&ugrave;"   : "\u{00F9}",
    "&uacute;"   : "\u{00FA}",
    "&ucirc;"    : "\u{00FB}",
    "&uuml;"     : "\u{00FC}",
    "&yacute;"   : "\u{00FD}",
    "&thorn;"    : "\u{00FE}",
    "&yuml;"     : "\u{00FF}",
    "&OElig;"    : "\u{0152}",
    "&oelig;"    : "\u{0153}",
    "&Scaron;"   : "\u{0160}",
    "&scaron;"   : "\u{0161}",
    "&Yuml;"     : "\u{0178}",
    "&fnof;"     : "\u{0192}",
    "&circ;"     : "\u{02C6}",
    "&tilde;"    : "\u{02DC}",
    "&Alpha;"    : "\u{0391}",
    "&Beta;"     : "\u{0392}",
    "&Gamma;"    : "\u{0393}",
    "&Delta;"    : "\u{0394}",
    "&Epsilon;"  : "\u{0395}",
    "&Zeta;"     : "\u{0396}",
    "&Eta;"      : "\u{0397}",
    "&Theta;"    : "\u{0398}",
    "&Iota;"     : "\u{0399}",
    "&Kappa;"    : "\u{039A}",
    "&Lambda;"   : "\u{039B}",
    "&Mu;"       : "\u{039C}",
    "&Nu;"       : "\u{039D}",
    "&Xi;"       : "\u{039E}",
    "&Omicron;"  : "\u{039F}",
    "&Pi;"       : "\u{03A0}",
    "&Rho;"      : "\u{03A1}",
    "&Sigma;"    : "\u{03A3}",
    "&Tau;"      : "\u{03A4}",
    "&Upsilon;"  : "\u{03A5}",
    "&Phi;"      : "\u{03A6}",
    "&Chi;"      : "\u{03A7}",
    "&Psi;"      : "\u{03A8}",
    "&Omega;"    : "\u{03A9}",
    "&alpha;"    : "\u{03B1}",
    "&beta;"     : "\u{03B2}",
    "&gamma;"    : "\u{03B3}",
    "&delta;"    : "\u{03B4}",
    "&epsilon;"  : "\u{03B5}",
    "&zeta;"     : "\u{03B6}",
    "&eta;"      : "\u{03B7}",
    "&theta;"    : "\u{03B8}",
    "&iota;"     : "\u{03B9}",
    "&kappa;"    : "\u{03BA}",
    "&lambda;"   : "\u{03BB}",
    "&mu;"       : "\u{03BC}",
    "&nu;"       : "\u{03BD}",
    "&xi;"       : "\u{03BE}",
    "&omicron;"  : "\u{03BF}",
    "&pi;"       : "\u{03C0}",
    "&rho;"      : "\u{03C1}",
    "&sigmaf;"   : "\u{03C2}",
    "&sigma;"    : "\u{03C3}",
    "&tau;"      : "\u{03C4}",
    "&upsilon;"  : "\u{03C5}",
    "&phi;"      : "\u{03C6}",
    "&chi;"      : "\u{03C7}",
    "&psi;"      : "\u{03C8}",
    "&omega;"    : "\u{03C9}",
    "&thetasym;" : "\u{03D1}",
    "&upsih;"    : "\u{03D2}",
    "&piv;"      : "\u{03D6}",
    "&ensp;"     : "\u{2002}",
    "&emsp;"     : "\u{2003}",
    "&thinsp;"   : "\u{2009}",
    "&zwnj;"     : "\u{200C}",
    "&zwj;"      : "\u{200D}",
    "&lrm;"      : "\u{200E}",
    "&rlm;"      : "\u{200F}",
    "&ndash;"    : "\u{2013}",
    "&mdash;"    : "\u{2014}",
    "&lsquo;"    : "\u{2018}",
    "&rsquo;"    : "\u{2019}",
    "&sbquo;"    : "\u{201A}",
    "&ldquo;"    : "\u{201C}",
    "&rdquo;"    : "\u{201D}",
    "&bdquo;"    : "\u{201E}",
    "&dagger;"   : "\u{2020}",
    "&Dagger;"   : "\u{2021}",
    "&bull;"     : "\u{2022}",
    "&hellip;"   : "\u{2026}",
    "&permil;"   : "\u{2030}",
    "&prime;"    : "\u{2032}",
    "&Prime;"    : "\u{2033}",
    "&lsaquo;"   : "\u{2039}",
    "&rsaquo;"   : "\u{203A}",
    "&oline;"    : "\u{203E}",
    "&frasl;"    : "\u{2044}",
    "&euro;"     : "\u{20AC}",
    "&image;"    : "\u{2111}",
    "&weierp;"   : "\u{2118}",
    "&real;"     : "\u{211C}",
    "&trade;"    : "\u{2122}",
    "&alefsym;"  : "\u{2135}",
    "&larr;"     : "\u{2190}",
    "&uarr;"     : "\u{2191}",
    "&rarr;"     : "\u{2192}",
    "&darr;"     : "\u{2193}",
    "&harr;"     : "\u{2194}",
    "&crarr;"    : "\u{21B5}",
    "&lArr;"     : "\u{21D0}",
    "&uArr;"     : "\u{21D1}",
    "&rArr;"     : "\u{21D2}",
    "&dArr;"     : "\u{21D3}",
    "&hArr;"     : "\u{21D4}",
    "&forall;"   : "\u{2200}",
    "&part;"     : "\u{2202}",
    "&exist;"    : "\u{2203}",
    "&empty;"    : "\u{2205}",
    "&nabla;"    : "\u{2207}",
    "&isin;"     : "\u{2208}",
    "&notin;"    : "\u{2209}",
    "&ni;"       : "\u{220B}",
    "&prod;"     : "\u{220F}",
    "&sum;"      : "\u{2211}",
    "&minus;"    : "\u{2212}",
    "&lowast;"   : "\u{2217}",
    "&radic;"    : "\u{221A}",
    "&prop;"     : "\u{221D}",
    "&infin;"    : "\u{221E}",
    "&ang;"      : "\u{2220}",
    "&and;"      : "\u{2227}",
    "&or;"       : "\u{2228}",
    "&cap;"      : "\u{2229}",
    "&cup;"      : "\u{222A}",
    "&int;"      : "\u{222B}",
    "&there4;"   : "\u{2234}",
    "&sim;"      : "\u{223C}",
    "&cong;"     : "\u{2245}",
    "&asymp;"    : "\u{2248}",
    "&ne;"       : "\u{2260}",
    "&equiv;"    : "\u{2261}",
    "&le;"       : "\u{2264}",
    "&ge;"       : "\u{2265}",
    "&sub;"      : "\u{2282}",
    "&sup;"      : "\u{2283}",
    "&nsub;"     : "\u{2284}",
    "&sube;"     : "\u{2286}",
    "&supe;"     : "\u{2287}",
    "&oplus;"    : "\u{2295}",
    "&otimes;"   : "\u{2297}",
    "&perp;"     : "\u{22A5}",
    "&sdot;"     : "\u{22C5}",
    "&lceil;"    : "\u{2308}",
    "&rceil;"    : "\u{2309}",
    "&lfloor;"   : "\u{230A}",
    "&rfloor;"   : "\u{230B}",
    // NOTE: U+2329 / U+232A are the HTML 4 code points for lang/rang;
    // HTML5 maps the same names to U+27E8 / U+27E9.
    "&lang;"     : "\u{2329}",
    "&rang;"     : "\u{232A}",
    "&loz;"      : "\u{25CA}",
    "&spades;"   : "\u{2660}",
    "&clubs;"    : "\u{2663}",
    "&hearts;"   : "\u{2665}",
    "&diams;"    : "\u{2666}",
]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
extension String {
    /// Creates a string from the plain text that `NSAttributedString` produces when it
    /// reads `htmlEncodedString` as a UTF-8 encoded HTML document.
    ///
    /// - Parameter htmlEncodedString: Text to be read as HTML.
    /// - Returns: `nil` when the text cannot be encoded as UTF-8 or when
    ///   `NSAttributedString` throws while reading the data.
    init?(htmlEncodedString: String) {
        let readingOptions: [NSAttributedString.DocumentReadingOptionKey: Any] = [
            .characterEncoding: String.Encoding.utf8.rawValue,
            .documentType: NSAttributedString.DocumentType.html
        ]

        guard
            let encoded = htmlEncodedString.data(using: .utf8),
            let decoded = try? NSAttributedString(data: encoded,
                                                  options: readingOptions,
                                                  documentAttributes: nil)
        else {
            return nil
        }

        // Only the characters are kept; the attributes of the imported document are discarded.
        self = decoded.string
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Sample input ending in a decimal numeric character reference:
// &#127819; is U+1F34B, the lemon emoji.
let htmlString = "Easy peasy lemon squeezy. &#127819;"
// The initializer is failable, so `fixedString` is a `String?` and `print`
// shows it wrapped in `Optional(...)`.
let fixedString = String(htmlEncodedString: htmlString)
print(fixedString)
Optional("Easy peasy lemon squeezy. 🍋")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// Excerpt of the lookup table from a complete named character entity (including the
/// leading `&` and trailing `;`) to the single character it represents.
private let characterEntities: [String: Character] = [
    // XML predefined entities:
    "&quot;"  : "\"",
    "&amp;"   : "&",
    "&apos;"  : "'",
    "&lt;"    : "<",
    "&gt;"    : ">",

    // HTML character entity references:
    "&nbsp;"  : "\u{00A0}",
    "&iexcl;" : "\u{00A1}",
    "&cent;"  : "\u{00A2}",
]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Sample input ending in a decimal numeric character reference:
// &#127819; is U+1F34B, the lemon emoji.
let htmlString = "Easy peasy lemon squeezy. &#127819;"
// `replacingCharacterEntities()` returns a non-optional `String`.
let fixedString = htmlString.replacingCharacterEntities()
print(fixedString)
Easy peasy lemon squeezy. 🍋
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
extension String {
    /// Returns a copy of the string in which character entities are replaced by the
    /// characters they stand for.
    ///
    /// Three forms are recognised:
    /// - named entities found in `characterEntities` (e.g. `&amp;`),
    /// - decimal numeric references (e.g. `&#127819;`),
    /// - hexadecimal numeric references (e.g. `&#x1F34B;`).
    ///
    /// Anything that looks like an entity but cannot be resolved (an unknown name, or
    /// a number that is not a valid Unicode scalar value) is copied to the result
    /// unchanged rather than being dropped.
    ///
    /// - Returns: The decoded string, or the receiver unchanged if the internal
    ///   regular expression cannot be compiled.
    func replacingCharacterEntities() -> String {
        /// Decodes a numeric reference of the form `&#DDD;` or `&#xHHH;`.
        /// Returns `nil` when the text has another shape, the number does not fit in
        /// 32 bits, or the value is not a Unicode scalar (e.g. a surrogate).
        func unicodeScalar(for numericCharacterEntity: String) -> Unicode.Scalar? {
            guard numericCharacterEntity.hasPrefix("&#"),
                  numericCharacterEntity.hasSuffix(";") else {
                return nil
            }
            var digits = numericCharacterEntity.dropFirst(2).dropLast()
            var radix = 10
            if let marker = digits.first, marker == "x" || marker == "X" {
                radix = 16
                digits = digits.dropFirst()
            }
            // Reject an explicit sign, which the integer parser would otherwise accept.
            guard let first = digits.first, first != "+", first != "-",
                  let value = UInt32(digits, radix: radix) else {
                return nil
            }
            return Unicode.Scalar(value)
        }

        // Matches exactly one entity: a decimal reference, a hexadecimal reference,
        // or a name. Being this strict keeps stray ampersands (as in "a & b;") and
        // text such as "&junk&amp;" from swallowing a following real entity.
        let pattern = #"&(?:#[0-9]+|#[xX][0-9A-Fa-f]+|[A-Za-z][A-Za-z0-9]*);"#
        guard let regex = try? NSRegularExpression(pattern: pattern, options: []) else {
            return self
        }

        var result = ""
        // Start of the text that has not yet been copied into `result`.
        var position = startIndex
        let wholeString = NSRange(startIndex..<endIndex, in: self)

        regex.enumerateMatches(in: self, options: [], range: wholeString) { match, _, _ in
            guard let match = match,
                  let matchRange = Range(match.range, in: self) else {
                return
            }

            // Copy the literal text between the previous entity and this one.
            result.append(contentsOf: self[position..<matchRange.lowerBound])

            let characterEntity = String(self[matchRange])
            if let replacement = characterEntities[characterEntity] {
                result.append(replacement)
            } else if let scalar = unicodeScalar(for: characterEntity) {
                result.unicodeScalars.append(scalar)
            } else {
                // Unknown or invalid entity: keep the source text instead of losing it.
                result.append(characterEntity)
            }

            // Resume directly after the match.
            position = matchRange.upperBound
        }

        // Copy whatever follows the last entity (the whole string if none matched).
        result.append(contentsOf: self[position...])
        return result
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment