Skip to content

Instantly share code, notes, and snippets.

@binarytemple
Last active August 29, 2015 14:01
Show Gist options
  • Save binarytemple/dc0750381572e1e80e50 to your computer and use it in GitHub Desktop.
Save binarytemple/dc0750381572e1e80e50 to your computer and use it in GitHub Desktop.
Scala html parsing

Some tricks for parsing HTML using Scala

import scala.xml.factory.XMLLoader
import scala.xml._
import scala.xml.parsing.{NoBindingFactoryAdapter}

/** Small demo: load HTML text into a scala.xml `Elem` by driving a custom
  * `XMLLoader` with the TagSoup SAX parser, then print the resulting tree and
  * the `href` attribute of every `a` element found in it.
  */
object MyReader extends App {

  // Alternative construction kept from the original snippet for reference:
  //   XML.withSAXParser(new org.ccil.cowan.tagsoup.jaxp.SAXFactoryImpl().newSAXParser())

  /** Loader combining the TagSoup SAX parser with a customised factory adapter. */
  private val htmlLoader: XMLLoader[Elem] = new XMLLoader[Elem] {

    import _root_.scala.xml.parsing.FactoryAdapter

    // SAX parser obtained from TagSoup's JAXP factory; created once per loader.
    override val parser: SAXParser = {
      val saxFactory = new org.ccil.cowan.tagsoup.jaxp.SAXFactoryImpl()
      saxFactory.newSAXParser()
    }

    // Each call hands out a new adapter instance.
    override def adapter: FactoryAdapter = new NoBindingFactoryAdapter() {

      // Element names that are reported to the adapter as carrying no text.
      // Note that "head" and "title" are part of this set as well.
      private val textlessElements: Set[String] = Set(
        "area", "base", "br", "col", "hr", "img",
        "head", "title", "input", "link", "meta", "param"
      )

      /** True for every element name that is not in `textlessElements`. */
      override def nodeContainsText(localName: String): Boolean =
        !textlessElements(localName)

      /** Passes the end-of-element event on to the superclass, except for
        * `head`, whose closing event is dropped without further handling.
        */
      override def endElement(uri: String, _localName: String, qname: String): Unit = {
        val closesHead = qname == "head"
        if (!closesHead) super.endElement(uri, _localName, qname)
      }
    }
  }

  /** Sample markup fed to the loader. */
  private val sampleHtml: String =
    """
      | <html>
      | <head>
      | <title>foo</title>
      | </head>
      | <body>
      | <br/>
      | <a href="google.com">google</a>
      | <a href="foo.com">foo</a>
      | </body>
      | </html>
    """.stripMargin

  /** Root element produced by parsing the sample markup. */
  val string: Elem = htmlLoader.loadString(sampleHtml)

  // Print the parsed tree first ...
  println(string)

  // ... then the optional `href` attribute of each `a` descendant.
  println(for (anchor <- string \\ "a") yield anchor.attribute("href"))

}

The above code produces the following output:

<html><title/><body>
 <br clear="none"/>
 <a href="google.com" shape="rect">google</a>
 <a href="foo.com" shape="rect">foo</a>
 </body></html>
List(Some(google.com), Some(foo.com))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment