Some tricks for parsing HTML in Scala, using scala-xml together with the TagSoup SAX parser:
import scala.xml.factory.XMLLoader
import scala.xml._
import scala.xml.parsing.{NoBindingFactoryAdapter}
/** Demo program: parses a small HTML snippet with scala-xml driven by the
  * TagSoup SAX parser, then prints the resulting tree followed by the `href`
  * attribute of every `<a>` element found in it.
  */
object MyReader extends App {

  // Simpler alternative when the factory adapter does not need customising:
  //   XML.withSAXParser(new org.ccil.cowan.tagsoup.jaxp.SAXFactoryImpl().newSAXParser())

  /** Loader that pairs the TagSoup SAX parser with a customised factory adapter. */
  private val htmlLoader: XMLLoader[Elem] = new XMLLoader[Elem] {
    import _root_.scala.xml.parsing.FactoryAdapter

    // Created once and reused for every load performed through this loader.
    override val parser: SAXParser =
      new org.ccil.cowan.tagsoup.jaxp.SAXFactoryImpl().newSAXParser()

    override def adapter: FactoryAdapter = new NoBindingFactoryAdapter() {

      // Tags for which the adapter is told that no text content is expected.
      // Note that "head" and "title" are listed alongside the void elements.
      val textlessTags = Set(
        "area", "base", "br", "col", "hr", "img",
        "head", "title", "input", "link", "meta", "param")

      /** Reports `false` for every tag listed in `textlessTags`, `true` otherwise. */
      override def nodeContainsText(localName: String) =
        !textlessTags(localName)

      /** Forwards every end-of-element event to the parent adapter except the
        * one for `head`, which is deliberately swallowed.
        * NOTE(review): presumably this is what makes `title` appear directly
        * under `html` (with no `head` wrapper) in the printed result — confirm
        * against the FactoryAdapter implementation in use.
        */
      override def endElement(uri: String, _localName: String, qname: String): Unit =
        qname match {
          case "head" => ()
          case _      => super.endElement(uri, _localName, qname)
        }
    }
  }

  /** Parsed tree of the sample document. */
  val string: Elem = htmlLoader.loadString(
    """
      | <html>
      | <head>
      | <title>foo</title>
      | </head>
      | <body>
      | <br/>
      | <a href="google.com">google</a>
      | <a href="foo.com">foo</a>
      | </body>
      | </html>
      |""".stripMargin)

  // Dump the whole tree first ...
  println(string)

  // ... then the optional `href` attribute of each anchor, in document order.
  println(for (anchor <- string \\ "a") yield anchor.attribute("href"))
}
The above code produces the following output:
<html><title/><body>
<br clear="none"/>
<a href="google.com" shape="rect">google</a>
<a href="foo.com" shape="rect">foo</a>
</body></html>
List(Some(google.com), Some(foo.com))