Created
February 12, 2019 09:34
-
-
Save jeiea/2c153a3affe630dfabc53691b20d609b to your computer and use it in GitHub Desktop.
HTML to plain text: a Kotlin port of https://stackoverflow.com/a/50363077
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Regex-based conversion of an HTML fragment to plain text.
 *
 * It depends on JSoup, which is used only to decode HTML entities.
 * Kotlin port of https://stackoverflow.com/a/50363077
 */
object Utils {
    // Names of the tags that are rendered as line breaks. Each alternative is a
    // regex fragment, so `h\d` stands for a heading tag followed by one digit.
    private val block = "address|article|aside|blockquote|canvas|dd|div|dl|dt|" +
        "fieldset|figcaption|figure|footer|form|h\\d|header|hr|li|main|nav|" +
        "noscript|ol|output|p|pre|section|table|tfoot|ul|video"

    // Two or more consecutive whitespace characters.
    private val rxlongWSpaces = Regex("""\s{2,}""")

    // One or more consecutive opening/closing block tags plus the whitespace around them.
    // The lookahead requires the tag name to end right there (whitespace, '/' or '>'),
    // so that "p" does not also match <param>/<picture>/<progress> and "li" does not
    // also match <link>.
    private val rxNestedBlock =
        Regex("""(\s*?</?(${block})(?=[\s/>])[^>]*?>)+\s*""", RegexOption.IGNORE_CASE)

    // Any <br> tag, with or without attributes or a self-closing slash.
    private val rxBrToNewLine = Regex("""<br[^>]*>""", RegexOption.IGNORE_CASE)

    // Any remaining tag; an unterminated '<' is removed together with the text after it.
    private val removeAllTags = Regex("""<[^>]*(>|$)""", RegexOption.MULTILINE)

    /**
     * Converts [html] to plain text.
     *
     * Steps, in order:
     * 1. every run of two or more whitespace characters becomes a single space;
     * 2. every run of block-level tags becomes one newline, and the result is trimmed;
     * 3. every `<br>` becomes a newline;
     * 4. all remaining tags are removed;
     * 5. HTML entities are decoded.
     *
     * Entities are decoded last, so escaped markup such as `&lt;b&gt;` ends up in the
     * output as literal text instead of being stripped as a tag.
     *
     * @param html the HTML source to convert
     * @return the plain-text rendering of [html]
     */
    fun htmlToPlainText(html: String): String {
        val collapsed = rxlongWSpaces.replace(html, " ")
        val blocksAsLines = rxNestedBlock.replace(collapsed, "\n").trim()
        val breaksAsLines = rxBrToNewLine.replace(blocksAsLines, "\n")
        val tagless = removeAllTags.replace(breaksAsLines, "")
        // NOTE(review): Parser is presumably org.jsoup.parser.Parser; its import is not
        // visible here. The `false` argument is JSoup's "inAttribute" flag.
        return Parser.unescapeEntities(tagless, false)
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment