Created
February 11, 2011 15:26
-
-
Save pchiusano/822494 to your computer and use it in GitHub Desktop.
Adapter to use scala's parser combinators for XML parsing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package xmlcombinators | |
import scala.util.parsing.combinator.Parsers | |
import scala.util.parsing.input.{NoPosition, Reader} | |
import javax.xml.stream.events.{Attribute, EndElement, XMLEvent} | |
import javax.xml.stream.{XMLEventReader, XMLInputFactory} | |
import collection.mutable.ArrayBuffer | |
import java.io.File | |
/** | |
* License: MIT license (http://www.opensource.org/licenses/mit-license.php) | |
* Copyright (C) 2011 by Capital IQ | |
* | |
* Permission is hereby granted, free of charge, to any person obtaining a copy | |
* of this software and associated documentation files (the "Software"), to deal | |
* in the Software without restriction, including without limitation the rights | |
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
* copies of the Software, and to permit persons to whom the Software is | |
* furnished to do so, subject to the following conditions: | |
* | |
* The above copyright notice and this permission notice shall be included in | |
* all copies or substantial portions of the Software. | |
* | |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
* THE SOFTWARE. | |
* | |
* @author Anthony Enache, Paul Chiusano | |
*/ | |
/** Adapter to run parser combinators using XMLEvent as the token type. | |
* Currently works by grabbing the full list of events up front from a | |
* javax.xml.stream.XMLEventReader, so this is not suitable for streaming. | |
* | |
* Examples: | |
* startElement("foo") <~ endElement("foo") applied to <foo bar="qux" baz="juju"></foo> | |
* results in the Map[String,String] of attrs for the tag 'foo': Map("bar"->"qux", "baz"->"juju") | |
* | |
* startElement("foo") ~ rep(bar) <~ endElement("foo") map { case attrs ~ bars => ... } | |
* | |
* Generally, it's better to use the nest and element combinators, to avoid having to | |
* manually match start and end tags in the grammar. | |
*/ | |
trait XMLEventParser extends Parsers {
  /** Tokens consumed by these parsers are StAX events. */
  type Elem = XMLEvent

  /** Implicitly copies a Scala sequence into a new `java.util.ArrayList`, preserving order. */
  implicit def listToJavaList[T](l: Seq[T]): java.util.ArrayList[T] = {
    val out = new java.util.ArrayList[T](l.size)
    l.foreach(e => out.add(e))
    out
  }

  /** Parser that accepts a start tag whose local name is `s`.
   *
   *  On success it consumes the event and returns a map from attribute local name
   *  to attribute value. On failure nothing is consumed and the failure is reported
   *  at the offending event.
   */
  def startElement(s: String): Parser[Map[String,String]] = new Parser[Map[String,String]] {
    def apply(in: Input): ParseResult[Map[String,String]] = {
      val elt = in.first
      if ((elt ne null) && elt.isStartElement && elt.asStartElement.getName.getLocalPart == s) {
        val attrs = elt.asStartElement.getAttributes
        val attributes = Map.newBuilder[String, String]
        while (attrs.hasNext) {
          // The cast is kept because older JDKs expose getAttributes as a raw Iterator.
          val attr = attrs.next().asInstanceOf[Attribute]
          attributes += (attr.getName.getLocalPart -> attr.getValue)
        }
        Success(attributes.result(), in.rest)
      } else {
        // Report the failure at the event that did not match, not one event past it.
        Failure("Expected start element with label " + s + ", but found " + elt, in)
      }
    }
  }

  /** Parser that accepts an end tag whose local name is `s` and returns the event.
   *  On failure nothing is consumed and the failure is reported at the offending event.
   */
  def endElement(s: String): Parser[EndElement] = new Parser[EndElement] {
    def apply(in: Input): ParseResult[EndElement] = {
      val elt = in.first
      // Same null guard as startElement: a reader may hand back null instead of an event.
      if ((elt ne null) && elt.isEndElement && elt.asEndElement.getName.getLocalPart == s) {
        Success(elt.asEndElement, in.rest)
      } else {
        Failure("Expected end element with label " + s + ", but found " + elt, in)
      }
    }
  }

  /** Parser for leaf elements of type <tag attr1="" attr2="" .../>. From the reader, this would
   *  generate a start element and end element event, of which only the start element is truly
   *  interesting as it carries the attributes with it. The result is the attribute map.
   */
  def element(s: String): Parser[Map[String, String]] = startElement(s) <~ endElement(s)

  /** Parser for non-leaf elements: start tag `s`, then `inner`, then end tag `s`.
   *  The attribute map and the result of `inner` are combined with `f`.
   */
  def element[A,B](s: String, inner: Parser[A])(f: ((Map[String,String],A)) => B): Parser[B] =
    startElement(s) ~ inner <~ endElement(s) ^^ { case attrs ~ a => f((attrs, a)) }

  /** Parser for leaf elements with attributes. The function f extracts an A from these attrs. */
  def element[A](s: String, f: Map[String,String] => A): Parser[A] =
    startElement(s).map(f) <~ endElement(s)

  /** Parser for elements surrounded by the tag withinTag, whose attributes are ignored. */
  def nest[A](withinTag: String, inner: Parser[A]): Parser[A] =
    element[A,A](withinTag, inner) { case (_, a) => a }

  /** Typesafe choice between two parsers: Left for a result of `p`, Right for a result of `p2`. */
  def choice[A,B](p: Parser[A], p2: Parser[B]): Parser[Either[A,B]] =
    p.map(a => Left(a)) | p2.map(b => Right(b))

  /** Parser for text within a tag. Example:
   *  nest("foo", textElement) applied to '<foo>thisText</foo>' results in 'thisText'.
   *  On failure nothing is consumed and the failure is reported at the offending event.
   */
  def textElement: Parser[String] = new Parser[String] {
    def apply(in: Input): ParseResult[String] = {
      val elt = in.first
      // Same null guard as startElement: a reader may hand back null instead of an event.
      if ((elt ne null) && elt.isCharacters) {
        Success(elt.asCharacters.getData, in.rest)
      } else {
        Failure("Expected text element, but found " + elt, in)
      }
    }
  }

  /** Parser that extracts the text from an element <s>text</s> */
  def textOnlyElement(s: String): Parser[String] = nest(s, textElement)

  /** Parser that extracts the text of an element <s>...</s> whose text content is optional. */
  def optionalTextOnlyElement(s: String): Parser[Option[String]] = nest(s, opt(textElement))

  /** Parse some prefix of reader `in' with parser `p' */
  def parse[T](p: Parser[T], in: Reader[XMLEvent]): ParseResult[T] =
    p(in)

  /** Parse all of reader `in' with parser `p' */
  def parseAll[T](p: Parser[T], in: Reader[XMLEvent]): ParseResult[T] =
    parse(phrase(p), in)

  /** Extract the mandatory attribute `a` from the attribute map.
   *  Throws IllegalArgumentException (via `require`) when the attribute is absent.
   */
  def attribute(as: Map[String, String], a: String): String = {
    require(as.contains(a), "Missing mandatory attribute '" + a + "'")
    as(a)
  }

  /** Extract the value of the attribute, if it exists, otherwise use the default */
  def attribute(as: Map[String, String], a: String, default: String): String =
    as.getOrElse(a, default)
}
object EventReader {
  /** Reads every event from `r` up front and returns the ones that matter to a grammar.
   *
   *  Start-document and end-document events are dropped, as are character events that
   *  consist only of whitespace. All other events are kept in document order.
   *
   *  The StAX event reader created here is closed before returning. Per the StAX API,
   *  closing it does not close `r`, which remains the caller's responsibility.
   *
   *  NOTE(review): the factory is used with its default configuration; if `r` can carry
   *  untrusted XML, confirm whether DTD / external entity support should be disabled.
   */
  def readEvents(r: java.io.Reader): Array[XMLEvent] = {
    val result = new ArrayBuffer[XMLEvent]()
    val reader: XMLEventReader = XMLInputFactory.newInstance().createXMLEventReader(r)
    try {
      while (reader.hasNext) {
        val event = reader.nextEvent()
        val isDocumentMarker = event.isStartDocument || event.isEndDocument
        val isBlankText = event.isCharacters && event.asCharacters.isWhiteSpace
        if (!isDocumentMarker && !isBlankText) {
          result += event
        }
      }
    } finally {
      // Release parser resources even if the document is malformed and nextEvent throws.
      reader.close()
    }
    result.toArray
  }
}
/**
 * Reader over a pre-loaded array of XML events, positioned at `index`.
 *
 * Note that the single-argument constructor loads its events through
 * EventReader.readEvents, which eats whitespace-only text as well as the
 * document start and end events.
 *
 * The reader is at its end only once every event has been consumed, i.e. when
 * `index` is past the last event. At that point `first` returns a synthetic
 * end-of-document event (never null, never an out-of-bounds access), so parsers
 * that inspect `first` without checking `atEnd` simply fail to match.
 */
class EventReader(index: Int, events: Array[XMLEvent]) extends Reader[XMLEvent] {
  /** Builds a reader positioned at the first event read from `r`. */
  def this(r: java.io.Reader) = this(0, EventReader.readEvents(r))

  /** True once all events have been consumed (also true for an empty event array). */
  def atEnd: Boolean = index >= events.length

  /** The current event, or a synthetic end-of-document event when the input is exhausted. */
  def first: XMLEvent =
    if (atEnd) javax.xml.stream.XMLEventFactory.newInstance().createEndDocument()
    else events(index)

  /** Reader advanced by one event; at the end of input the reader itself is returned. */
  def rest: EventReader = if (atEnd) this else new EventReader(index + 1, events)

  /** Events carry no position information in this reader. */
  def pos = NoPosition
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment