Created
December 18, 2017 05:21
-
-
Save jaredsinclair/268457c96867a0ac93540d6105a8afa1 to your computer and use it in GitHub Desktop.
Convert HTML string to plain text (Swift version of MWFeedParser's utility)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// Quick-n-dirty translation of MWFeedParser's algorithm from Objective-C to Swift | |
/// seealso: https://github.com/mwaterfall/MWFeedParser/blob/master/Classes/NSString%2BHTML.m | |
public extension NSString { | |
public func byConvertingHTMLToPlainText() -> String { | |
let stopCharacters = CharacterSet(charactersIn: "< \t\n\r\(0x0085)\(0x000C)\(0x2028)\(0x2029)") | |
let newLineAndWhitespaceCharacters = CharacterSet(charactersIn: " \t\n\r\(0x0085)\(0x000C)\(0x2028)\(0x2029)") | |
let tagNameCharacters = CharacterSet(charactersIn: "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") | |
let result = NSMutableString(capacity: length) | |
let scanner = Scanner(string: self as String) | |
scanner.charactersToBeSkipped = nil | |
scanner.caseSensitive = true | |
var str: NSString? = nil | |
var tagName: NSString? = nil | |
var dontReplaceTagWithSpace = false | |
repeat { | |
// Scan up to the start of a tag or whitespace | |
if scanner.scanUpToCharacters(from: stopCharacters, into: &str), let s = str { | |
result.append(s as String) | |
str = nil | |
} | |
// Check if we've stopped at a tag/comment or whitespace | |
if scanner.scanString("<", into: nil) { | |
// Stopped at a comment, script tag, or other tag | |
if scanner.scanString("!--", into: nil) { | |
// Comment | |
scanner.scanUpTo("-->", into: nil) | |
scanner.scanString("-->", into: nil) | |
} else if scanner.scanString("script", into: nil) { | |
// Script tag where things don't need escaping! | |
scanner.scanUpTo("</script>", into: nil) | |
scanner.scanString("</script>", into: nil) | |
} else { | |
// Tag - remove and replace with space unless it's | |
// a closing inline tag then dont replace with a space | |
if scanner.scanString("/", into: nil) { | |
// Closing tag - replace with space unless it's inline | |
tagName = nil | |
dontReplaceTagWithSpace = false | |
if scanner.scanCharacters(from: tagNameCharacters, into: &tagName), let t = tagName { | |
tagName = t.lowercased as NSString | |
dontReplaceTagWithSpace = | |
tagName == "a" || | |
tagName == "b" || | |
tagName == "i" || | |
tagName == "q" || | |
tagName == "span" || | |
tagName == "em" || | |
tagName == "strong" || | |
tagName == "cite" || | |
tagName == "abbr" || | |
tagName == "acronym" || | |
tagName == "label" | |
} | |
// Replace tag with string unless it was an inline | |
if !dontReplaceTagWithSpace && result.length > 0 && !scanner.isAtEnd { | |
result.append(" ") | |
} | |
} | |
// Scan past tag | |
scanner.scanUpTo(">", into: nil) | |
scanner.scanString(">", into: nil) | |
} | |
} else { | |
// Stopped at whitespace - replace all whitespace and newlines with a space | |
if scanner.scanCharacters(from: newLineAndWhitespaceCharacters, into: nil) { | |
if result.length > 0 && !scanner.isAtEnd { | |
result.append(" ") // Dont append space to beginning or end of result | |
} | |
} | |
} | |
} while !scanner.isAtEnd | |
// Cleanup | |
// Decode HTML entities and return (this isn't included in this gist, but is often important) | |
// let retString = (result as String).stringByDecodingHTMLEntities | |
// Return | |
return result as String // retString; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Helpful, but still need improvement