Created
September 27, 2012 14:58
-
-
Save jelmervdl/3794475 to your computer and use it in GitHub Desktop.
Query validation against a DTD. Uses xqilla and libxml2.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="iso-8859-1"?>
<!-- Alpino Dependency Structures DTD -->
<!--
  $Id: alpino_ds.dtd,v 1.1 2005-11-25 14:43:27 geertk Exp $

  We have the top-level element
      alpino_ds
  Below it occur:
      sentence (PCDATA)
      comments
      node

  A DTD cannot express "each of these exactly once, in any order". See e.g.
  http://www.xml.com/pub/a/98/07/dtd/.
  It is possible in SGML (the "ampersand connector") and with RELAX NG.

  Here we have to choose between an overly permissive model and a fixed
  order:
      <!ELEMENT alpino_ds ( (node | sentence | comments)+ )>
  or
      <!ELEMENT alpino_ds (node, sentence, comments?)>
  We choose the strict variant.
-->

<!-- Alpino Dependency Structure -->
<!ELEMENT alpino_ds (node, sentence, comments?) >
<!ATTLIST alpino_ds
  version NMTOKEN #IMPLIED>

<!-- Node -->
<!-- a node of a dependency tree -->
<!ELEMENT node (node*) >
<!--
  rel          the dependency relation of the node
  cat          the category of the node
  pos          the part-of-speech tag of the node, ALPINO style
  postag       the part-of-speech tag of the node, CGN/DCOI/LASSY style
  lemma        the lemma of the node, CGN/DCOI/LASSY style
  begin        the start position of the corresponding word group in the whole sentence
  end          the end position of the corresponding word group in the whole sentence
  root         the stem of the word, ALPINO style
  word         the word as it occurs in the sentence
  index        a co-index that makes shared structures possible
  id           a number, unique per sentence, that identifies the node
  case         case of nouns (nom,acc,dat,..)
  comparative  type of complement of comparatives (dan,als,dat,..)
  def          definiteness of nouns (def,indef,..)
  frame        full Alpino POS tag
  gen          gender of nouns (de,het,..)
  infl         inflection of adjectives, determiners
  neclass      classification of names (org, loc, per, misc)
  num          number information of nouns (sg,pl,..)
  per          person information of nouns (fir,thi,..)
  refl         reflexivity of pronouns (refl)
  sc           subcategorization frame
  special      catch-all for other special features
  wh           question-word information for pronouns (ywh,nwh,rwh)

  Perhaps more attributes should be #REQUIRED, or NMTOKEN would be a
  better choice than CDATA for some of them.
  It is also possible to enumerate the allowed values. For some
  attributes that may be desirable.
-->
<!ATTLIST node
  rel (hdf|hd|cmp|sup|su|obj1|pobj1|obj2|se|pc|vc|svp|predc|ld|me|predm|
       obcomp|mod|body|det|app|whd|rhd|cnj| crd|nucl|sat|tag|
       dp|top|mwp|dlink|--)
      #REQUIRED
  cat (smain|np|ppart|ppres|pp|ssub|inf|cp|du|ap|advp|ti|rel|whrel|whsub|conj|
       whq|oti|ahi|detp|sv1|svan|mwu|top)
      #IMPLIED
  pos CDATA #IMPLIED
  postag CDATA #IMPLIED
  lemma CDATA #IMPLIED
  begin CDATA #IMPLIED
  end CDATA #IMPLIED
  root CDATA #IMPLIED
  word CDATA #IMPLIED
  index CDATA #IMPLIED
  id CDATA #IMPLIED
  pt (let|spec|bw|vg|lid|vnw|tw|ww|adj|n|tsw|vz) #IMPLIED
  dial (dial) #IMPLIED
  ntype (soort|eigen) #IMPLIED
  getal (getal|ev|mv) #IMPLIED
  graad (basis|comp|sup|dim) #IMPLIED
  genus (genus|zijd|masc|fem|onz) #IMPLIED
  naamval (stan|nomin|obl|bijz|gen|dat) #IMPLIED
  positie (prenom|nom|postnom|vrij) #IMPLIED
  buiging (zonder|met-e|met-s) #IMPLIED
  getal-n (zonder-n|mv-n) #IMPLIED
  wvorm (pv|inf|od|vd) #IMPLIED
  pvtijd (tgw|verl|conj) #IMPLIED
  pvagr (ev|mv|met-t) #IMPLIED
  numtype (hoofd|rang) #IMPLIED
  vwtype (pr|pers|refl|recip|bez|vb|vrag|betr|excl|aanw|onbep) #IMPLIED
  pdtype (pron|adv-pron|det|grad) #IMPLIED
  persoon (persoon|1|2|2v|2b|3|3p|3m|3v|3o) #IMPLIED
  status (vol|red|nadr) #IMPLIED
  npagr (agr|evon|rest|evz|mv|agr3|evmo|rest3|evf) #IMPLIED
  lwtype (bep|onbep) #IMPLIED
  vztype (init|versm|fin) #IMPLIED
  conjtype (neven|onder) #IMPLIED
  spectype
      (afgebr|onverst|vreemd|deeleigen|meta|comment|achter|afk|symb|enof)
      #IMPLIED
  mwu_root CDATA #IMPLIED
  case CDATA #IMPLIED
  comparative CDATA #IMPLIED
  def CDATA #IMPLIED
  frame CDATA #IMPLIED
  gen CDATA #IMPLIED
  infl CDATA #IMPLIED
  neclass CDATA #IMPLIED
  num CDATA #IMPLIED
  per CDATA #IMPLIED
  refl CDATA #IMPLIED
  sc CDATA #IMPLIED
  special CDATA #IMPLIED
  wh CDATA #IMPLIED
>

<!-- Sentence -->
<!-- the text of the whole sentence -->
<!ELEMENT sentence (#PCDATA) >

<!-- Comments -->
<!-- one or more comment elements -->
<!ELEMENT comments (comment+) >

<!-- Comment -->
<!-- a single comment entry -->
<!ELEMENT comment (#PCDATA) >
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# Build ./queryvalidator from ./main.cpp against XQilla, Xerces-C and libxml2.
#
# DBXML_PREFIX may be set in the environment to point at the install prefix
# that provides the XQilla/Xerces headers and libraries; it defaults to the
# path this script originally hard-coded.
set -e

DBXML_PREFIX="${DBXML_PREFIX:-/Users/jelmer/workspace/rug/compling/dbxml-2.5.16/install}"

# The -l options come after main.cpp on purpose: the linker resolves symbols
# in command-line order, so libraries must follow the objects that use them.
g++ -Wall \
    -I"${DBXML_PREFIX}/include" \
    -I/usr/include/libxml2 \
    -L"${DBXML_PREFIX}/lib" \
    -o ./queryvalidator \
    ./main.cpp \
    -lxqilla \
    -lxerces-c \
    -lxml2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm>
#include <cstring>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>
#include <xqilla/utils/XQillaPlatformUtils.hpp>
#include <xqilla/xqilla-simple.hpp>
#include <xqilla/exceptions/XQException.hpp>
#include <xqilla/ast/ASTNode.hpp>
#include <xqilla/ast/XQNav.hpp>
#include <xqilla/ast/XQAtomize.hpp>
#include <xqilla/ast/XQDocumentOrder.hpp>
#include <xqilla/ast/XQOperator.hpp>
#include <xqilla/ast/XQPredicate.hpp>
#include <xqilla/axis/NodeTest.hpp>
#include <xercesc/util/XMLString.hpp>
#include <libxml/hash.h>
#include <libxml/parser.h>
#include <libxml/tree.h>
#include <libxml/xmlIO.h>
using namespace std; | |
using namespace XERCES_CPP_NAMESPACE; | |
namespace { | |
struct Globals { | |
Globals(); | |
virtual ~Globals(); | |
}; | |
static Globals s_globals; | |
Globals::Globals() { | |
XQillaPlatformUtils::initialize(); | |
} | |
Globals::~Globals() { | |
XQillaPlatformUtils::terminate(); | |
} | |
static XQilla s_xqilla; | |
} | |
/**
 * One level of element context while walking a query.
 *
 * A scope stores the name of the node it currently refers to and an
 * optional, non-owning pointer to the scope it is nested in. The parent
 * must outlive the child.
 */
class Scope
{
public:
    /**
     * @param parent enclosing scope, or 0 for a root scope. Not owned.
     *
     * explicit, so a bare Scope pointer is never silently converted into a
     * brand-new Scope object.
     */
    explicit Scope(Scope const *parent = 0)
    :
        d_nodeName(),
        d_parent(parent)
    {
    }

    /// Replace the node name this scope refers to.
    void setNodeName(std::string const &name)
    {
        d_nodeName = name;
    }

    /// Name last set with setNodeName(); empty if none was set.
    std::string const &nodeName() const
    {
        return d_nodeName;
    }

    /**
     * Render the chain of scopes from this one up to the root.
     *
     * Every name is preceded by '>' and the innermost scope comes first,
     * e.g. ">node>[document root]".
     */
    std::string path() const
    {
        std::stringstream ss;

        for (Scope const *scope = this; scope != 0; scope = scope->d_parent)
            ss << ">" << scope->nodeName();

        return ss.str();
    }

private:
    std::string d_nodeName;
    Scope const *d_parent;
};
/**
 * Lookup table describing which elements a DTD declares and which
 * attributes each of those elements may carry.
 *
 * The table maps an element name to the list of its attribute names; it is
 * copied on construction.
 */
class DTD
{
    typedef std::map<std::string, std::vector<std::string> > ElementMap;

public:
    /// @param elements element name -> attribute names declared for it.
    explicit DTD(std::map<std::string, std::vector<std::string> > const &elements)
    :
        d_map(elements)
    {
    }

    /**
     * @return true when `element` is a key of the table.
     *
     * `parent` is accepted for the benefit of callers but is not consulted:
     * the table holds no content-model information, so nesting cannot be
     * checked here.
     */
    bool allowElement(std::string const &element, std::string const &parent) const
    {
        (void) parent;
        return d_map.find(element) != d_map.end();
    }

    /**
     * @return true when `element` is known and `attribute` is listed for
     *         it; false for an unknown element.
     */
    bool allowAttribute(std::string const &attribute, std::string const &element) const
    {
        ElementMap::const_iterator pos = d_map.find(element);

        if (pos == d_map.end())
            return false;

        std::vector<std::string> const &attributes = pos->second;
        return std::find(attributes.begin(), attributes.end(), attribute) != attributes.end();
    }

private:
    ElementMap d_map;
};
void inspect(ASTNode *node, Scope *scope, DTD const &dtd) | |
{ | |
switch (node->getType()) | |
{ | |
case ASTNode::NAVIGATION: | |
{ | |
cout << "Type NAVIGATION" << endl; | |
XQNav *nav = reinterpret_cast<XQNav*>(node); | |
XQNav::Steps steps(nav->getSteps()); | |
for (XQNav::Steps::const_iterator it = steps.begin(); it != steps.end(); ++it) | |
inspect(it->step, scope, dtd); | |
break; | |
} | |
case ASTNode::LITERAL: | |
cout << "Type LITERAL" << endl; | |
break; | |
case ASTNode::NUMERIC_LITERAL: | |
cout << "Type NUMERIC_LITERAL" << endl; | |
break; | |
case ASTNode::QNAME_LITERAL: | |
cout << "Type QNAME_LITERAL" << endl; | |
break; | |
case ASTNode::SEQUENCE: | |
cout << "Type SEQUENCE" << endl; | |
break; | |
case ASTNode::FUNCTION: | |
cout << "Type FUNCTION" << endl; | |
break; | |
case ASTNode::VARIABLE: | |
cout << "Type VARIABLE" << endl; | |
break; | |
case ASTNode::STEP: | |
{ | |
cout << "Type STEP" << endl; | |
XQStep *step = reinterpret_cast<XQStep*>(node); | |
NodeTest *test = step->getNodeTest(); | |
char *nodeType = XMLString::transcode(test->getNodeType()); | |
char *nodeName = XMLString::transcode(test->getNodeName()); | |
if (strcmp(nodeType, "element") == 0) | |
{ | |
// Test to see if this element is allowed here | |
if (!dtd.allowElement(nodeName, scope->nodeName())) | |
cerr << "The element " << nodeName | |
<< " is not allowed inside " | |
<< scope->nodeName() << endl; | |
scope->setNodeName(nodeName); | |
} | |
else if (strcmp(nodeType, "attribute") == 0) | |
{ | |
if (!dtd.allowAttribute(nodeName, scope->nodeName())) | |
cerr << "The attribute " << nodeName | |
<< " is not allowed inside " | |
<< scope->nodeName() << endl; | |
} | |
delete[] nodeType; | |
delete[] nodeName; | |
break; | |
} | |
case ASTNode::IF: | |
cout << "Type IF" << endl; | |
break; | |
case ASTNode::INSTANCE_OF: | |
cout << "Type INSTANCE_OF" << endl; | |
break; | |
case ASTNode::CASTABLE_AS: | |
cout << "Type CASTABLE_AS" << endl; | |
break; | |
case ASTNode::CAST_AS: | |
cout << "Type CAST_AS" << endl; | |
break; | |
case ASTNode::TREAT_AS: | |
cout << "Type TREAT_AS" << endl; | |
break; | |
case ASTNode::OPERATOR: | |
{ | |
cout << "Type OPERATOR" << endl; | |
XQOperator *op = reinterpret_cast<XQOperator *>(node); | |
VectorOfASTNodes const &args(op->getArguments()); | |
for (VectorOfASTNodes::const_iterator it = args.begin(); | |
it != args.end(); ++it) | |
inspect(*it, scope, dtd); | |
break; | |
} | |
case ASTNode::CONTEXT_ITEM: | |
cout << "Type CONTEXT_ITEM" << endl; | |
break; | |
case ASTNode::DOM_CONSTRUCTOR: | |
cout << "Type DOM_CONSTRUCTOR" << endl; | |
break; | |
case ASTNode::QUANTIFIED: | |
cout << "Type QUANTIFIED" << endl; | |
break; | |
case ASTNode::TYPESWITCH: | |
cout << "Type TYPESWITCH" << endl; | |
break; | |
case ASTNode::VALIDATE: | |
cout << "Type VALIDATE" << endl; | |
break; | |
case ASTNode::FUNCTION_CALL: | |
cout << "Type FUNCTION_CALL" << endl; | |
break; | |
case ASTNode::USER_FUNCTION: | |
cout << "Type USER_FUNCTION" << endl; | |
break; | |
case ASTNode::ORDERING_CHANGE: | |
cout << "Type ORDERING_CHANGE" << endl; | |
break; | |
case ASTNode::XPATH1_CONVERT: | |
cout << "Type XPATH1_CONVERT" << endl; | |
break; | |
case ASTNode::PROMOTE_UNTYPED: | |
cout << "Type PROMOTE_UNTYPED" << endl; | |
break; | |
case ASTNode::PROMOTE_NUMERIC: | |
cout << "Type PROMOTE_NUMERIC" << endl; | |
break; | |
case ASTNode::PROMOTE_ANY_URI: | |
cout << "Type PROMOTE_ANY_URI" << endl; | |
break; | |
case ASTNode::DOCUMENT_ORDER: | |
{ | |
cout << "Type DOCUMENT_ORDER" << endl; | |
XQDocumentOrder *docOrder = reinterpret_cast<XQDocumentOrder*>(node); | |
inspect(docOrder->getExpression(), scope, dtd); | |
break; | |
} | |
case ASTNode::PREDICATE: | |
{ | |
cout << "Type PREDICATE" << endl; | |
XQPredicate *predicate = reinterpret_cast<XQPredicate*>(node); | |
Scope *stepScope = new Scope(scope); | |
inspect(predicate->getExpression(), stepScope, dtd); | |
inspect(predicate->getPredicate(), stepScope, dtd); | |
break; | |
} | |
case ASTNode::ATOMIZE: | |
{ | |
cout << "Type ATOMIZE" << endl; | |
XQAtomize *atomize = reinterpret_cast<XQAtomize*>(node); | |
inspect(atomize->getExpression(), scope, dtd); | |
break; | |
} | |
case ASTNode::EBV: | |
cout << "Type EBV" << endl; | |
break; | |
case ASTNode::FTCONTAINS: | |
cout << "Type FTCONTAINS" << endl; | |
break; | |
case ASTNode::UDELETE: | |
cout << "Type UDELETE" << endl; | |
break; | |
case ASTNode::URENAME: | |
cout << "Type URENAME" << endl; | |
break; | |
case ASTNode::UREPLACE: | |
cout << "Type UREPLACE" << endl; | |
break; | |
case ASTNode::UREPLACE_VALUE_OF: | |
cout << "Type UREPLACE_VALUE_OF" << endl; | |
break; | |
case ASTNode::UTRANSFORM: | |
cout << "Type UTRANSFORM" << endl; | |
break; | |
case ASTNode::UINSERT_AS_FIRST: | |
cout << "Type UINSERT_AS_FIRST" << endl; | |
break; | |
case ASTNode::UINSERT_AS_LAST: | |
cout << "Type UINSERT_AS_LAST" << endl; | |
break; | |
case ASTNode::UINSERT_INTO: | |
cout << "Type UINSERT_INTO" << endl; | |
break; | |
case ASTNode::UINSERT_AFTER: | |
cout << "Type UINSERT_AFTER" << endl; | |
break; | |
case ASTNode::UINSERT_BEFORE: | |
cout << "Type UINSERT_BEFORE" << endl; | |
break; | |
case ASTNode::UAPPLY_UPDATES: | |
cout << "Type UAPPLY_UPDATES" << endl; | |
break; | |
case ASTNode::NAME_EXPRESSION: | |
cout << "Type NAME_EXPRESSION" << endl; | |
break; | |
case ASTNode::CONTENT_SEQUENCE: | |
cout << "Type CONTENT_SEQUENCE" << endl; | |
break; | |
case ASTNode::DIRECT_NAME: | |
cout << "Type DIRECT_NAME" << endl; | |
break; | |
case ASTNode::RETURN: | |
cout << "Type RETURN" << endl; | |
break; | |
case ASTNode::NAMESPACE_BINDING: | |
cout << "Type NAMESPACE_BINDING" << endl; | |
break; | |
case ASTNode::FUNCTION_CONVERSION: | |
cout << "Type FUNCTION_CONVERSION" << endl; | |
break; | |
case ASTNode::SIMPLE_CONTENT: | |
cout << "Type SIMPLE_CONTENT" << endl; | |
break; | |
case ASTNode::ANALYZE_STRING: | |
cout << "Type ANALYZE_STRING" << endl; | |
break; | |
case ASTNode::CALL_TEMPLATE: | |
cout << "Type CALL_TEMPLATE" << endl; | |
break; | |
case ASTNode::APPLY_TEMPLATES: | |
cout << "Type APPLY_TEMPLATES" << endl; | |
break; | |
case ASTNode::INLINE_FUNCTION: | |
cout << "Type INLINE_FUNCTION" << endl; | |
break; | |
case ASTNode::FUNCTION_REF: | |
cout << "Type FUNCTION_REF" << endl; | |
break; | |
case ASTNode::FUNCTION_DEREF: | |
cout << "Type FUNCTION_DEREF" << endl; | |
break; | |
case ASTNode::COPY_OF: | |
cout << "Type COPY_OF" << endl; | |
break; | |
case ASTNode::COPY: | |
cout << "Type COPY" << endl; | |
break; | |
case ASTNode::MAP: | |
cout << "Type MAP" << endl; | |
break; | |
case ASTNode::DEBUG_HOOK: | |
cout << "Type DEBUG_HOOK" << endl; | |
break; | |
} | |
} | |
void scanElement(void *payload, void *data, xmlChar *name) | |
{ | |
xmlElement *elem = reinterpret_cast<xmlElement*>(payload); | |
map<string,vector<string> > *elements = reinterpret_cast<map<string,vector<string> > *>(data); | |
for (xmlAttributePtr attr = elem->attributes; attr != NULL; attr = reinterpret_cast<xmlAttributePtr>(attr->next)) | |
(*elements)[reinterpret_cast<char const *>(elem->name)].push_back(reinterpret_cast<char const *>(attr->name)); | |
} | |
bool parseDtd(string const &file, map<string, vector<string> > &elements) | |
{ | |
xmlParserInputBufferPtr input = xmlParserInputBufferCreateFilename(file.c_str(), XML_CHAR_ENCODING_8859_1); | |
xmlDtdPtr dtd = xmlIOParseDTD(NULL, input, XML_CHAR_ENCODING_8859_1); | |
if (dtd == NULL) | |
{ | |
cerr << "Could not parse DTD" << endl; | |
return false; | |
} | |
if (dtd->elements == NULL) | |
{ | |
cerr << "DTD hashtable has no elements" << endl; | |
return false; | |
} | |
xmlHashScan(reinterpret_cast<xmlHashTablePtr>(dtd->elements), scanElement, &elements); | |
return true; | |
} | |
int main(int argc, char** argv) | |
{ | |
string queryString(argc > 1 ? argv[1] : "//node[@rel='su' and string(@begin) = 4]"); | |
DynamicContext *ctx = s_xqilla.createContext(XQilla::XPATH2); | |
ctx->setXPath1CompatibilityMode(true); | |
XQQuery *query = s_xqilla.parse(X(queryString.c_str()), ctx); | |
cout << query->getQueryPlan() << endl; | |
ASTNode *root = query->getQueryBody(); | |
Scope *rootScope = new Scope(); | |
rootScope->setNodeName("[document root]"); | |
map<string,vector<string> > elements; | |
parseDtd("alpino_ds.dtd", elements); | |
DTD alpinoDtd(elements); | |
inspect(root, rootScope, alpinoDtd); | |
cout << "Done." << endl; | |
delete rootScope; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment