Created
February 11, 2014 19:11
-
-
Save uhbif19/8941865 to your computer and use it in GitHub Desktop.
HTML Parser using TagSoup
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// TagSoup dependency, required by the HTML parser object in this gist.
libraryDependencies += "org.ccil.cowan.tagsoup" % "tagsoup" % "1.2.1"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Parses HTML into `scala.xml.Node` trees using the TagSoup SAX parser.
 */
object HTML {
  import scala.xml.Node
  import scala.xml.parsing.NoBindingFactoryAdapter
  import org.xml.sax.InputSource
  import java.io.ByteArrayInputStream
  import org.ccil.cowan.tagsoup.jaxp.SAXFactoryImpl

  // Both instances are created on first use and then shared by every `parse` call.
  // NOTE(review): SAX parsers and handlers are generally not safe for concurrent
  // use — confirm that callers do not invoke `parse` from several threads at once.
  lazy val adapter = new NoBindingFactoryAdapter()
  lazy val parser = (new SAXFactoryImpl).newSAXParser

  /**
   * Parses an HTML document held in a string.
   *
   * The string is encoded to bytes with `encoding`, and the same charset name is
   * recorded on the input source so the parser decodes those bytes consistently.
   *
   * @param html     the HTML text to parse
   * @param encoding charset name used for the intermediate byte representation
   * @return the root node of the parsed document
   * @throws java.io.UnsupportedEncodingException if `encoding` is not a supported charset name
   */
  def parse(html: String, encoding: String = "UTF-8"): Node = {
    val source = new InputSource(new ByteArrayInputStream(html.getBytes(encoding)))
    // Without this the bytes carry no charset information and the parser falls
    // back to its own default, which can garble non-ASCII text.
    source.setEncoding(encoding)
    adapter.loadXML(source, parser)
  }

  /**
   * Parses an HTML document held in a byte array.
   *
   * No charset is supplied with the bytes, so decoding is left to the parser.
   *
   * @param html the raw bytes of the HTML document
   * @return the root node of the parsed document
   */
  def parse(html: Array[Byte]): Node =
    adapter.loadXML(new InputSource(new ByteArrayInputStream(html)), parser)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment