Skip to content

Instantly share code, notes, and snippets.

@uhbif19
Created February 11, 2014 19:11
Show Gist options
  • Save uhbif19/8941865 to your computer and use it in GitHub Desktop.
Save uhbif19/8941865 to your computer and use it in GitHub Desktop.
HTML Parser using TagSoup
libraryDependencies += "org.ccil.cowan.tagsoup" % "tagsoup" % "1.2.1"
/** Parses (possibly malformed) HTML into a scala-xml [[scala.xml.Node]] tree,
  * using TagSoup's SAX parser as the front end.
  */
object HTML {
  import scala.xml.Node
  import scala.xml.parsing.NoBindingFactoryAdapter
  import org.xml.sax.InputSource
  import java.io.ByteArrayInputStream
  import org.ccil.cowan.tagsoup.jaxp.SAXFactoryImpl

  // Both instances are created on first use and then reused by every parse call.
  // NOTE(review): they are shared state; presumably not safe for concurrent
  // `parse` calls from several threads -- confirm before using this object that way.
  lazy val adapter = new NoBindingFactoryAdapter()
  lazy val parser = (new SAXFactoryImpl).newSAXParser

  /** Parses HTML held in a string.
    *
    * The text is encoded to bytes with `encoding`, and the same encoding name is
    * declared on the input source so the bytes are decoded with the charset they
    * were written in instead of leaving the choice to the parser.
    *
    * @param html     the HTML text
    * @param encoding charset name used for the byte round trip (default "UTF-8")
    * @return the root node of the parsed document
    * @throws java.io.UnsupportedEncodingException if `encoding` is not a supported charset name
    */
  def parse(html: String, encoding: String = "UTF-8"): Node =
    parse(html.getBytes(encoding), encoding)

  /** Parses HTML held in a byte array without declaring an encoding.
    *
    * No encoding is set on the input source, so character decoding is left to
    * the parser.
    * NOTE(review): TagSoup appears to fall back to the platform default charset
    * in this case -- confirm; prefer the two-argument overload when the charset
    * is known.
    *
    * @param html the raw HTML bytes
    * @return the root node of the parsed document
    */
  def parse(html: Array[Byte]): Node =
    load(new InputSource(new ByteArrayInputStream(html)))

  /** Parses HTML held in a byte array whose charset is known.
    *
    * @param html     the raw HTML bytes
    * @param encoding charset name the bytes are encoded in; declared on the input source
    * @return the root node of the parsed document
    */
  def parse(html: Array[Byte], encoding: String): Node = {
    val source = new InputSource(new ByteArrayInputStream(html))
    source.setEncoding(encoding)
    load(source)
  }

  /** Runs the shared parser over `source`, building nodes with the shared adapter. */
  private def load(source: InputSource): Node =
    adapter.loadXML(source, parser)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment