Created
February 20, 2012 18:24
-
-
Save bblfish/1870493 to your computer and use it in GitHub Desktop.
Nomo Turtle recursion issue - (see PN_PREFIX_1, PN_PREFIX_2, ...)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* Copyright (c) 2012 Henry Story | |
* under the Open Source MIT Licence http://www.opensource.org/licenses/MIT | |
*/ | |
package org.w3.rdf | |
import nomo._ | |
import nomo.Errors.{TreeError, Single} | |
import scala.collection.mutable | |
import org.w3.rdf.Module | |
/**
 * A sink that accepts values of type `T` one at a time.
 *
 * @tparam T the type of value accepted by `send`
 */
trait ListenerAgent[T] {
  /**
   * Hands a single value to this agent.
   *
   * @param a the value being delivered
   */
  def send(a: T): Unit
}
/**
 * Async Parser for the simplest of all RDF encodings: NTriples
 * http://www.w3.org/TR/rdf-testcases/#ntriples
 *
 * This is using the nomo library that is being developed
 * here: https://bitbucket.org/pchiusano/nomo
 *
 * Every parser below is a strict `val`, so each one is declared after the
 * parsers it is built from: a forward reference to a later `val` would be
 * read as `null` while the class is still being constructed.
 *
 * @param m the RDF module; its members are imported and used to build results
 * @param P the nomo parser combinators, reading `Char` input
 * @tparam M the concrete module type
 * @tparam U the listener type; `nTriples` calls `send` on it for each triple
 *
 * @author bblfish
 * @since 02/02/2012
 */
class NTriplesParser[M <: Module,F,E,X,U <: ListenerAgent[Any]](val m: M, val P: Parsers[F, Char, E, X, U]) {
  //todo: can't work out how to get the right dependent type for ListenerAgent. Should be ListenerAgent[m.Triple]
  import m._

  //todo: do we really need a tree error for such a simple language (what do TreeErrors enable?)
  implicit def toTreeError(msg: String): Errors.TreeError = Errors.Single(msg, None)

  /** Characters accepted (case-insensitively) in a language tag. */
  val alpha_digit_dash = "abcdefghijklmnopqrstuvwxyz0123456789-"
  val hexadecimalChars = "1234567890ABCDEFabcdef"

  /** A single hexadecimal digit. */
  def hex = P.anyOf(hexadecimalChars)

  /** Language tag: a run of letters, digits and dashes, wrapped in `Lang`. */
  val lang = P.takeWhile1(c => alpha_digit_dash.contains(c.toLower),
                          pos => P.err.single('!',pos)).map(l => Lang(l.toSeq.mkString))

  /** Spaces and tabs, parsed with `takeWhile1` (presumably at least one). */
  val space1 = P.takeWhile1( c => c == ' '|| c == '\t', pos => P.err.single('!',pos))
  /** Spaces and tabs, parsed with `takeWhile`. */
  val space = P.takeWhile( c => c == ' '|| c == '\t' )
  val anySpace = P.takeWhile(_.isWhitespace )
  /** End of line: LF, CRLF or a lone CR. */
  val eoln = P.word("\n") | P.word ("\r\n")| P.word("\r")

  /** True for characters allowed unescaped inside `<...>`. */
  def isUriChar(c: Char) = ( ! c.isWhitespace) && c != '<' && c != '>' &&
    c> 0x1F && (c < 0x7F || c > 0x9F ) //control characters

  import P.++

  /** Blank node: `_:` followed by letters or digits. */
  val bnode = P.word("_:")>>P.takeWhile1(_.isLetterOrDigit,pos => P.err.single('!',pos)).map (n=>BNode(n.toSeq.mkString))

  /** Four-hex-digit escape, decoded to the `Char` with that value. */
  val u_CHAR = (P.word("\\u")>> hex++hex++hex++hex) map {
    case c1++c2++c3++c4 => Integer.parseInt(new String(Array(c1,c2,c3,c4)),16).toChar
  }

  /**
   * Eight-hex-digit escape.
   *
   * NOTE(review): `.toChar` keeps only the low 16 bits, so a value above
   * 0xFFFF does not decode to the intended code point. Fixing that would
   * change this parser's result type from `Char`, so it is left as is.
   */
  val U_CHAR = (P.word("\\U")>> hex++hex++hex++hex++hex++hex++hex++hex) map {
    case c1++c2++c3++c4++c5++c6++c7++c8 => Integer.parseInt(new String(Array(c1,c2,c3,c4,c5,c6,c7,c8)),16).toChar
  }

  // Two-character backslash escapes, each mapped to the character it denotes.
  val lt_tab = P.word("\\t").map(c=>0x9.toChar)
  val lt_cr = P.word("\\r").map(c=>0xD.toChar)
  val lt_nl = P.word("\\n").map(c=>0xA.toChar)
  val lt_slash = P.word("\\\\").map(c=>'\\')
  val lt_quote = P.word("\\\"").map(c=>'"')

  /** Body of a quoted literal: escapes and runs of ordinary characters, concatenated. */
  val literal = ( u_CHAR | U_CHAR | lt_tab | lt_cr | lt_nl | lt_slash | lt_quote |
    P.takeWhile1(c=> c!= '\\' && c != '"', pos => P.err.single('!',pos)).map(n=>n.toSeq.mkString)
    ).many.map(l=> l.toSeq.mkString)

  /** Text between `<` and `>`: escapes and runs of URI characters, concatenated. */
  val uriStr = (u_CHAR | U_CHAR | lt_slash | lt_quote |
    P.takeWhile1(c => isUriChar(c),pos => P.err.single('!',pos)).map(n=>n.toSeq.mkString)
    ).many1.map(i=>i.toSeq.mkString)

  val xsd = "http://www.w3.org/2001/XMLSchema#"
  val rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  val xsdString = IRI(xsd + "string")

  /** A double-quoted lexical form. */
  val plainLit = (P.single('"')>>literal<< P.single('\"'))

  /** `<...>` wrapped in `IRI`. Declared before `typeFunc`, which uses it. */
  val uriRef = ( P.single('<') >> uriStr << P.single('>')).map(i=>IRI(i))

  // Literal suffixes. Declared before `fullLiteral`, which combines them:
  // datatype goes to the Left, language tag to the Right.
  val typeFunc = (P.word("^^") >> uriRef) map Left.apply
  val langFunc = (P.word("@") >> lang) map Right.apply

  /** A quoted literal with an optional `^^<datatype>` or `@lang` suffix. */
  val fullLiteral = plainLit ++ (typeFunc | langFunc).optional map {
    case lexicalForm ++ None => TypedLiteral(lexicalForm)
    case lexicalForm ++ Some(Left(datatype)) => TypedLiteral(lexicalForm, datatype)
    case lexicalForm ++ Some(Right(tag)) => LangLiteral(lexicalForm, tag)
  }

  val dot = P.single('.')
  val pred = uriRef
  val subject = uriRef | bnode
  val obj = uriRef | bnode | fullLiteral

  /** One statement: subject, predicate and object separated by `space1`, then a dot. */
  val nTriple = (subject++(space1>>pred)++(space1>>obj)).map{case s++r++o=> Triple(s,r,o)} << (space>>dot>>space)

  /** `#` followed by everything up to (not including) the end of the line. */
  val comment = P.single('#') >> P.takeWhile(c =>c != '\r' && c != '\n' )

  /** One input line: a comment or nothing (both `None`), or a triple (`Some`). */
  val line = space >> (comment.as(None) | nTriple.map(Some(_)) | P.unit(None) )

  /** function that parse NTriples and send results to user in a streaming fashion */
  val nTriples = (line.mapResult{
    r =>
      r.get match {
        case Some(t) => r.user.send(t)
        case None => ()
      }
      Success(r)
  } ).delimitIgnore(eoln)

  /** function that parses NTriples and return result to caller as a list */
  val nTriplesList = line.delimit(eoln).map(_.flatten)
}
/**
 * Helpers for writing N-Triples: escaping of literal text.
 */
object NTriplesParser {
  /** Upper-case hexadecimal digits, indexed by their value (0 to 15). */
  val hexChar = Array( '0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F')

  /**
   * Builds a backslash escape: a backslash, the marker letter, then `value`
   * written as exactly `digits` upper-case hexadecimal digits.
   */
  private def hexEscape(marker: Char, value: Int, digits: Int): String = {
    val out = new StringBuilder(digits + 2)
    out.append('\\').append(marker)
    var shift = (digits - 1) * 4
    while (shift >= 0) {
      out.append(hexChar((value >> shift) & 0xF))
      shift -= 4
    }
    out.toString
  }

  /**
   * Escapes `str` for use inside a double-quoted N-Triples literal.
   *
   * Printable ASCII (0x20 to 0x7E) is copied through, except that the double
   * quote and the backslash are backslash-escaped. Tab, line feed and carriage
   * return become their two-character escapes. Every other code point up to
   * 0xFFFF becomes a 4-digit 'u' escape, and code points from 0x10000 upwards
   * become a single 8-digit 'U' escape.
   *
   * The string is walked by Unicode code point rather than by UTF-16 unit, so
   * a surrogate pair produces one 'U' escape instead of two 'u' escapes of its
   * halves. An unpaired surrogate is still written as a 4-digit escape.
   *
   * @param str the text to escape
   * @return the escaped text, without surrounding quotes
   */
  def toLiteral(str: String): String = {
    val b = new StringBuilder
    var i = 0
    while (i < str.length) {
      val cp = str.codePointAt(i)
      cp match {
        case 0x9 => b.append("\\t")
        case 0xA => b.append("\\n")
        case 0xD => b.append("\\r")
        case 0x22 => b.append('\\').append('"')
        case 0x5C => b.append('\\').append('\\')
        case p if p >= 0x20 && p <= 0x7E => b.append(p.toChar)
        case p if p <= 0xFFFF => b.append(hexEscape('u', p, 4))
        case p => b.append(hexEscape('U', p, 8))
      }
      // advance by one or two UTF-16 units depending on the code point
      i += Character.charCount(cp)
    }
    b.toString()
  }
}
/**
 * Turtle Parser as specified at http://www.w3.org/TR/turtle/
 *
 * Only the `@prefix` directive is covered so far. Every parser below is a
 * strict `val`, so each one is declared after the parsers and functions it is
 * built from: a forward reference to a later `val` would be read as `null`
 * while the class is still being constructed.
 *
 * @param m the RDF module (not used by the parsers defined so far)
 * @param P the nomo parser combinators, reading `Char` input
 * @tparam M the concrete module type
 * @tparam F first type parameter passed through to `Parsers`
 * @tparam E error type parameter passed through to `Parsers`
 * @tparam X position type handed to the error callback `err`
 * @tparam U user state type passed through to `Parsers`
 */
class TurtleParser[M <: Module,F,E,X,U <: ListenerAgent[Any]](val m: M, val P: Parsers[F, Char, E, X, U]) {
  import TurtleParser._
  import P.++

  /** Parses the single token given that matches the function */
  def single(isC: Char => Boolean ): P.Parser[Char] = P.any mapResult (s =>
    s.status.flatMap(i => if (isC(i) ) Success(i) else Failure(P.err.single(i, s.position))))

  /** Error callback shared by the `takeWhile1` parsers; declared first because `SP` and `IRI_REF` use it. */
  val err = (pos: X) =>P.err.single('!',pos)

  val COLON = P.single(':')
  val PREFIX = P.word("@prefix")
  val dot = P.single('.')

  /** `#` followed by everything up to (not including) the end of the line. */
  val comment = P.single('#')>>P.takeWhile(c=> c != '\r' && c != '\n')

  /** Separator: runs of whitespace and comments, repeated with `many1`. */
  val SP = (P.takeWhile1(c=> " \t\r\n".contains(c),err) | comment ).many1

  val hexadecimalChars = "1234567890ABCDEFabcdef"

  /** A single hexadecimal digit. */
  def hex = P.anyOf(hexadecimalChars)

  /** Four-hex-digit escape, decoded to the `Char` with that value. */
  val u_CHAR = (P.word("\\u")>> hex++hex++hex++hex) map {
    case c1++c2++c3++c4 => Integer.parseInt(new String(Array(c1,c2,c3,c4)),16).toChar
  }

  /**
   * Eight-hex-digit escape.
   *
   * NOTE(review): `.toChar` keeps only the low 16 bits, so a value above
   * 0xFFFF does not decode to the intended code point.
   */
  val U_CHAR = (P.word("\\U")>> hex++hex++hex++hex++hex++hex++hex++hex) map {
    case c1++c2++c3++c4++c5++c6++c7++c8 => Integer.parseInt(new String(Array(c1,c2,c3,c4,c5,c6,c7,c8)),16).toChar
  }

  val UCHAR = u_CHAR | U_CHAR
  val UCHARS = UCHAR.many1.map(_.mkString)

  // Character classes of prefixed names; the predicates come from the
  // TurtleParser companion object.
  val PN_CHARS_BASE = single(pn_chars_simple) | UCHAR
  val PN_CHARS_U = PN_CHARS_BASE | P.single ('_')
  val PN_CHARS = single(pn_chars_dot) | UCHAR
  val PN_CHARS_LAST = single(pn_chars) | UCHAR

  // Prefix names of length 1, 2, 3 and longer, each followed by a colon.
  // NOTE(review): PN_PREFIX_1 has no `map`, so unlike the other three it does
  // not build a String - confirm the result type of PNAME_NS is as intended.
  val PN_PREFIX_1 = PN_CHARS_BASE << COLON
  val PN_PREFIX_2 = (PN_CHARS_BASE ++ PN_CHARS_LAST << COLON).map{ case a++z=> ""+a+z }
  val PN_PREFIX_3 = (PN_CHARS_BASE ++ PN_CHARS++PN_CHARS_LAST << COLON).map{ case a++b++z=> ""+a+b+z }
  val PN_PREFIX_x = ( PN_CHARS_BASE ++ PN_CHARS.many1 ++ PN_CHARS_LAST << COLON ).map{
    case c++list++last => c+list.mkString+last
  }

  /** Namespace prefix: a bare colon (mapped to the empty string) or one of the PN_PREFIX forms. */
  val PNAME_NS = COLON.map(c=>"") | PN_PREFIX_1 | PN_PREFIX_2 | PN_PREFIX_3 | PN_PREFIX_x

  /** `<...>`: runs of IRI characters and unicode escapes, concatenated. */
  val IRI_REF = P.single('<')>>(P.takeWhile1(iri_char,err) | UCHARS).many.map(_.mkString)<<P.single ('>')

  /** `@prefix`, separator, namespace prefix, separator, IRI reference. */
  val prefixID = (PREFIX >> SP >> PNAME_NS) ++ (SP>>IRI_REF)
  val directive = prefixID //| base
  val statement = ( directive << dot ) //| ( turtleTriples << dot )
}
/**
 * Character-class tables and predicates used by the Turtle parser.
 *
 * Ranges are inclusive `(low, high)` pairs of code point values. The
 * predicates take a `Char`, whose value never exceeds 0xFFFF, so the range
 * starting at 0x10000 cannot be matched through them.
 */
object TurtleParser {
  /** Inclusive ranges accepted by `pn_chars_simple`. */
  val pn_simple_set: List[(Int, Int)] = List(
    ('A'.toInt, 'Z'.toInt), ('a'.toInt, 'z'.toInt),
    (0x00C0, 0x00D6), (0x00D8, 0x00F6), (0x00F8, 0x02FF), (0x0370, 0x037D),
    (0x037F, 0x1FFF), (0x200C, 0x200D), (0x2070, 0x218F), (0x2C00, 0x2FEF),
    (0x3001, 0xD7FF), (0xF900, 0xFDCF), (0xFDF0, 0xFFFD), (0x10000, 0xEFFFF)
  )

  /** Printable characters rejected by `iri_char`. */
  val non_iri_chars: Array[Char] = Array('<','>','"','{','}','|','^','`','\\')

  /** `pn_simple_set` extended with the digits and two further ranges. */
  val pn_chars_set: List[(Int, Int)] =
    ('0'.toInt, '9'.toInt) :: pn_simple_set ::: List((0x300, 0x36F), (0x203F, 0x2040))

  /** True when `c` falls in one of the `pn_simple_set` ranges. */
  def pn_chars_simple(c: Char): Boolean = pn_simple_set.exists(in(_)(c))

  /** True for a dot or any character accepted by `pn_chars`. */
  def pn_chars_dot(c: Char): Boolean = c == '.' || pn_chars(c)

  /** True for dash, underscore, 0xB7, or a character in one of the `pn_chars_set` ranges. */
  def pn_chars(c: Char): Boolean =
    c == '-' || c == '_' || c == 0xB7 || pn_chars_set.exists(in(_)(c))

  /** True unless `c` is listed in `non_iri_chars` or lies in 0x00 to 0x20 (space) inclusive. */
  def iri_char(c: Char): Boolean = !( non_iri_chars.contains(c) || in((0, ' '.toInt))(c) )

  /**
   * Inclusive range test.
   *
   * @param interval the `(low, high)` bounds, both included
   * @param c the character to test
   */
  def in(interval: (Int, Int))(c: Char): Boolean = c >= interval._1 && c <= interval._2
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment