Skip to content

Instantly share code, notes, and snippets.

@jelmervdl
Created September 27, 2012 14:58
Show Gist options
  • Save jelmervdl/3794475 to your computer and use it in GitHub Desktop.
Save jelmervdl/3794475 to your computer and use it in GitHub Desktop.
Validates the element and attribute names used in an XPath query against a DTD. Uses XQilla and libxml2.
<?xml version="1.0" encoding="iso-8859-1"?>
<!-- Alpino Dependency Structures DTD -->
<!--
$Id: alpino_ds.dtd,v 1.1 2005-11-25 14:43:27 geertk Exp $
We have the top-level element
alpino_ds
Beneath it occur:
sentence (PCDATA)
comments
node
A DTD cannot express that each of them must occur exactly once while
their order does not matter. See e.g.
http://www.xml.com/pub/a/98/07/dtd/.
That is possible in SGML (the "ampersand connector") and with RELAX NG.
Here we have to choose between an option that is too permissive and
fixing the order:
<!ELEMENT alpino_ds ( (node | sentence | comments)+ )>
or
<!ELEMENT alpino_ds (node, sentence, comments?)>
We choose the strict variant.
-->
<!-- Alpino Dependency Structure -->
<!-- root element: one node tree, then the sentence, then optional comments -->
<!ELEMENT alpino_ds (node, sentence, comments?) >
<!ATTLIST alpino_ds
version NMTOKEN #IMPLIED>
<!-- Node -->
<!-- a node of a dependency tree -->
<!ELEMENT node (node*) >
<!--
rel the dependency relation of the node
cat the category of the node
pos the part-of-speech tag of the node, ALPINO style
postag the part-of-speech tag of the node, CGN/DCOI/LASSY style
lemma the lemma of the node, CGN/DCOI/LASSY style
begin the start position of the corresponding word group within the whole sentence
end the end position of the corresponding word group within the whole sentence
root the stem of the word, ALPINO style
word the word as it occurs in the sentence
index a co-index that makes shared structures possible
id a number, unique per sentence, used to identify the nodes
case case of nouns (nom,acc,dat,..)
comparative type of complement of comparatives (dan,als,dat,..)
def definiteness of nouns (def,indef,..)
frame full Alpino POS tag
gen gender of nouns (de,het,..)
infl inflection of adjectives, determiners
neclass classification of names (org, loc, per, misc)
num number information of nouns (sg,pl,..)
per person information of nouns (fir,thi,..)
refl reflexivity of pronouns (refl)
sc subcategorization frame
special catch-all for other special features
wh question-word information for pronouns (ywh,nwh,rwh)
Possibly more of these should be #REQUIRED, or NMTOKEN would be a
better choice than CDATA for some of them.
It is also possible to enumerate the allowed values. For some
attributes that may be desirable.
-->
<!ATTLIST node
rel (hdf|hd|cmp|sup|su|obj1|pobj1|obj2|se|pc|vc|svp|predc|ld|me|predm|
obcomp|mod|body|det|app|whd|rhd|cnj| crd|nucl|sat|tag|
dp|top|mwp|dlink|--)
#REQUIRED
cat (smain|np|ppart|ppres|pp|ssub|inf|cp|du|ap|advp|ti|rel|whrel|whsub|conj|
whq|oti|ahi|detp|sv1|svan|mwu|top)
#IMPLIED
pos CDATA #IMPLIED
postag CDATA #IMPLIED
lemma CDATA #IMPLIED
begin CDATA #IMPLIED
end CDATA #IMPLIED
root CDATA #IMPLIED
word CDATA #IMPLIED
index CDATA #IMPLIED
id CDATA #IMPLIED
pt (let|spec|bw|vg|lid|vnw|tw|ww|adj|n|tsw|vz) #IMPLIED
dial (dial) #IMPLIED
ntype (soort|eigen) #IMPLIED
getal (getal|ev|mv) #IMPLIED
graad (basis|comp|sup|dim) #IMPLIED
genus (genus|zijd|masc|fem|onz) #IMPLIED
naamval (stan|nomin|obl|bijz|gen|dat) #IMPLIED
positie (prenom|nom|postnom|vrij) #IMPLIED
buiging (zonder|met-e|met-s) #IMPLIED
getal-n (zonder-n|mv-n) #IMPLIED
wvorm (pv|inf|od|vd) #IMPLIED
pvtijd (tgw|verl|conj) #IMPLIED
pvagr (ev|mv|met-t) #IMPLIED
numtype (hoofd|rang) #IMPLIED
vwtype (pr|pers|refl|recip|bez|vb|vrag|betr|excl|aanw|onbep) #IMPLIED
pdtype (pron|adv-pron|det|grad) #IMPLIED
persoon (persoon|1|2|2v|2b|3|3p|3m|3v|3o) #IMPLIED
status (vol|red|nadr) #IMPLIED
npagr (agr|evon|rest|evz|mv|agr3|evmo|rest3|evf) #IMPLIED
lwtype (bep|onbep) #IMPLIED
vztype (init|versm|fin) #IMPLIED
conjtype (neven|onder) #IMPLIED
spectype
(afgebr|onverst|vreemd|deeleigen|meta|comment|achter|afk|symb|enof)
#IMPLIED
mwu_root CDATA #IMPLIED
case CDATA #IMPLIED
comparative CDATA #IMPLIED
def CDATA #IMPLIED
frame CDATA #IMPLIED
gen CDATA #IMPLIED
infl CDATA #IMPLIED
neclass CDATA #IMPLIED
num CDATA #IMPLIED
per CDATA #IMPLIED
refl CDATA #IMPLIED
sc CDATA #IMPLIED
special CDATA #IMPLIED
wh CDATA #IMPLIED
>
<!-- Sentence -->
<!-- the text of the whole sentence -->
<!ELEMENT sentence (#PCDATA) >
<!-- Comments -->
<!-- one or more comment elements -->
<!ELEMENT comments (comment+) >
<!-- Comment -->
<!-- a single comment entry -->
<!ELEMENT comment (#PCDATA) >
#!/bin/sh
# Builds ./queryvalidator from ./main.cpp.
#
# DBXML_PREFIX  Install prefix whose include/ and lib/ directories are searched
#               for the xqilla and xerces-c headers and libraries. Defaults to
#               the original author's local dbxml install; override it in the
#               environment to build elsewhere, e.g.
#                   DBXML_PREFIX=/opt/dbxml ./build.sh
DBXML_PREFIX="${DBXML_PREFIX:-/Users/jelmer/workspace/rug/compling/dbxml-2.5.16/install}"

# The source file is listed before the -l options on purpose: linkers that
# resolve symbols strictly left to right (static archives, --as-needed) only
# pull in a library for references they have already seen.
g++ -Wall \
    -I"$DBXML_PREFIX/include" \
    -I/usr/include/libxml2 \
    -o ./queryvalidator \
    ./main.cpp \
    -L"$DBXML_PREFIX/lib" \
    -lxqilla \
    -lxerces-c \
    -lxml2
#include <algorithm>
#include <cstring>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>
#include <xqilla/utils/XQillaPlatformUtils.hpp>
#include <xqilla/xqilla-simple.hpp>
#include <xqilla/ast/ASTNode.hpp>
#include <xqilla/ast/XQNav.hpp>
#include <xqilla/ast/XQAtomize.hpp>
#include <xqilla/ast/XQDocumentOrder.hpp>
#include <xqilla/ast/XQOperator.hpp>
#include <xqilla/ast/XQPredicate.hpp>
#include <xqilla/axis/NodeTest.hpp>
#include <xercesc/util/XMLString.hpp>
#include <libxml/hash.h>
#include <libxml/parser.h>
#include <libxml/tree.h>
#include <libxml/xmlIO.h>
using namespace std;
using namespace XERCES_CPP_NAMESPACE;
namespace {

// Ties the XQilla platform layer to static storage duration: initialize() runs
// when s_globals is constructed and terminate() runs when it is destroyed.
struct Globals {
    Globals()
    {
        XQillaPlatformUtils::initialize();
    }

    virtual ~Globals()
    {
        XQillaPlatformUtils::terminate();
    }
};

// Defined ahead of s_xqilla, so within this translation unit it is
// constructed first and destroyed last.
static Globals s_globals;

// The XQilla instance main() uses to create contexts and parse queries.
static XQilla s_xqilla;

}
// One level of the query context: remembers the name of the node the query is
// currently positioned on and links to the enclosing scope (if any).
class Scope
{
    std::string d_nodeName;   // empty until setNodeName() is called
    Scope const *d_parent;    // not owned; 0 for the outermost scope

public:
    // Creates a scope nested inside `parent` (0 means no enclosing scope).
    Scope(Scope const *parent = 0) : d_parent(parent) {}

    // Records the name of the node this scope is positioned on.
    void setNodeName(std::string const &name) { d_nodeName = name; }

    // Returns the name last stored with setNodeName().
    std::string const &nodeName() const { return d_nodeName; }

    // Returns the node names from this scope outwards to the outermost one,
    // each prefixed with '>', e.g. ">child>parent".
    std::string path() const
    {
        std::string joined;
        for (Scope const *current = this; current != 0; current = current->d_parent)
        {
            joined += ">";
            joined += current->d_nodeName;
        }
        return joined;
    }
};
// Lookup table of the element names known to a DTD and, per element, the
// attribute names declared for it.
class DTD
{
    typedef std::map<std::string, std::vector<std::string> > ElementMap;

    ElementMap d_map;   // element name -> declared attribute names

public:
    // Takes a copy of `elements` (element name -> attribute names).
    DTD(std::map<std::string, std::vector<std::string> > const &elements)
    :
        d_map(elements)
    {}

    // True when `element` is a known element name. `parent` is accepted but
    // not consulted: no content-model check is performed.
    bool allowElement(std::string const &element, std::string const &parent) const
    {
        return d_map.count(element) != 0;
    }

    // True when `element` is known and lists `attribute` among its attributes.
    bool allowAttribute(std::string const &attribute, std::string const &element) const
    {
        ElementMap::const_iterator entry = d_map.find(element);
        if (entry != d_map.end())
        {
            std::vector<std::string> const &declared = entry->second;
            return std::count(declared.begin(), declared.end(), attribute) > 0;
        }
        return false;
    }
};
void inspect(ASTNode *node, Scope *scope, DTD const &dtd)
{
switch (node->getType())
{
case ASTNode::NAVIGATION:
{
cout << "Type NAVIGATION" << endl;
XQNav *nav = reinterpret_cast<XQNav*>(node);
XQNav::Steps steps(nav->getSteps());
for (XQNav::Steps::const_iterator it = steps.begin(); it != steps.end(); ++it)
inspect(it->step, scope, dtd);
break;
}
case ASTNode::LITERAL:
cout << "Type LITERAL" << endl;
break;
case ASTNode::NUMERIC_LITERAL:
cout << "Type NUMERIC_LITERAL" << endl;
break;
case ASTNode::QNAME_LITERAL:
cout << "Type QNAME_LITERAL" << endl;
break;
case ASTNode::SEQUENCE:
cout << "Type SEQUENCE" << endl;
break;
case ASTNode::FUNCTION:
cout << "Type FUNCTION" << endl;
break;
case ASTNode::VARIABLE:
cout << "Type VARIABLE" << endl;
break;
case ASTNode::STEP:
{
cout << "Type STEP" << endl;
XQStep *step = reinterpret_cast<XQStep*>(node);
NodeTest *test = step->getNodeTest();
char *nodeType = XMLString::transcode(test->getNodeType());
char *nodeName = XMLString::transcode(test->getNodeName());
if (strcmp(nodeType, "element") == 0)
{
// Test to see if this element is allowed here
if (!dtd.allowElement(nodeName, scope->nodeName()))
cerr << "The element " << nodeName
<< " is not allowed inside "
<< scope->nodeName() << endl;
scope->setNodeName(nodeName);
}
else if (strcmp(nodeType, "attribute") == 0)
{
if (!dtd.allowAttribute(nodeName, scope->nodeName()))
cerr << "The attribute " << nodeName
<< " is not allowed inside "
<< scope->nodeName() << endl;
}
delete[] nodeType;
delete[] nodeName;
break;
}
case ASTNode::IF:
cout << "Type IF" << endl;
break;
case ASTNode::INSTANCE_OF:
cout << "Type INSTANCE_OF" << endl;
break;
case ASTNode::CASTABLE_AS:
cout << "Type CASTABLE_AS" << endl;
break;
case ASTNode::CAST_AS:
cout << "Type CAST_AS" << endl;
break;
case ASTNode::TREAT_AS:
cout << "Type TREAT_AS" << endl;
break;
case ASTNode::OPERATOR:
{
cout << "Type OPERATOR" << endl;
XQOperator *op = reinterpret_cast<XQOperator *>(node);
VectorOfASTNodes const &args(op->getArguments());
for (VectorOfASTNodes::const_iterator it = args.begin();
it != args.end(); ++it)
inspect(*it, scope, dtd);
break;
}
case ASTNode::CONTEXT_ITEM:
cout << "Type CONTEXT_ITEM" << endl;
break;
case ASTNode::DOM_CONSTRUCTOR:
cout << "Type DOM_CONSTRUCTOR" << endl;
break;
case ASTNode::QUANTIFIED:
cout << "Type QUANTIFIED" << endl;
break;
case ASTNode::TYPESWITCH:
cout << "Type TYPESWITCH" << endl;
break;
case ASTNode::VALIDATE:
cout << "Type VALIDATE" << endl;
break;
case ASTNode::FUNCTION_CALL:
cout << "Type FUNCTION_CALL" << endl;
break;
case ASTNode::USER_FUNCTION:
cout << "Type USER_FUNCTION" << endl;
break;
case ASTNode::ORDERING_CHANGE:
cout << "Type ORDERING_CHANGE" << endl;
break;
case ASTNode::XPATH1_CONVERT:
cout << "Type XPATH1_CONVERT" << endl;
break;
case ASTNode::PROMOTE_UNTYPED:
cout << "Type PROMOTE_UNTYPED" << endl;
break;
case ASTNode::PROMOTE_NUMERIC:
cout << "Type PROMOTE_NUMERIC" << endl;
break;
case ASTNode::PROMOTE_ANY_URI:
cout << "Type PROMOTE_ANY_URI" << endl;
break;
case ASTNode::DOCUMENT_ORDER:
{
cout << "Type DOCUMENT_ORDER" << endl;
XQDocumentOrder *docOrder = reinterpret_cast<XQDocumentOrder*>(node);
inspect(docOrder->getExpression(), scope, dtd);
break;
}
case ASTNode::PREDICATE:
{
cout << "Type PREDICATE" << endl;
XQPredicate *predicate = reinterpret_cast<XQPredicate*>(node);
Scope *stepScope = new Scope(scope);
inspect(predicate->getExpression(), stepScope, dtd);
inspect(predicate->getPredicate(), stepScope, dtd);
break;
}
case ASTNode::ATOMIZE:
{
cout << "Type ATOMIZE" << endl;
XQAtomize *atomize = reinterpret_cast<XQAtomize*>(node);
inspect(atomize->getExpression(), scope, dtd);
break;
}
case ASTNode::EBV:
cout << "Type EBV" << endl;
break;
case ASTNode::FTCONTAINS:
cout << "Type FTCONTAINS" << endl;
break;
case ASTNode::UDELETE:
cout << "Type UDELETE" << endl;
break;
case ASTNode::URENAME:
cout << "Type URENAME" << endl;
break;
case ASTNode::UREPLACE:
cout << "Type UREPLACE" << endl;
break;
case ASTNode::UREPLACE_VALUE_OF:
cout << "Type UREPLACE_VALUE_OF" << endl;
break;
case ASTNode::UTRANSFORM:
cout << "Type UTRANSFORM" << endl;
break;
case ASTNode::UINSERT_AS_FIRST:
cout << "Type UINSERT_AS_FIRST" << endl;
break;
case ASTNode::UINSERT_AS_LAST:
cout << "Type UINSERT_AS_LAST" << endl;
break;
case ASTNode::UINSERT_INTO:
cout << "Type UINSERT_INTO" << endl;
break;
case ASTNode::UINSERT_AFTER:
cout << "Type UINSERT_AFTER" << endl;
break;
case ASTNode::UINSERT_BEFORE:
cout << "Type UINSERT_BEFORE" << endl;
break;
case ASTNode::UAPPLY_UPDATES:
cout << "Type UAPPLY_UPDATES" << endl;
break;
case ASTNode::NAME_EXPRESSION:
cout << "Type NAME_EXPRESSION" << endl;
break;
case ASTNode::CONTENT_SEQUENCE:
cout << "Type CONTENT_SEQUENCE" << endl;
break;
case ASTNode::DIRECT_NAME:
cout << "Type DIRECT_NAME" << endl;
break;
case ASTNode::RETURN:
cout << "Type RETURN" << endl;
break;
case ASTNode::NAMESPACE_BINDING:
cout << "Type NAMESPACE_BINDING" << endl;
break;
case ASTNode::FUNCTION_CONVERSION:
cout << "Type FUNCTION_CONVERSION" << endl;
break;
case ASTNode::SIMPLE_CONTENT:
cout << "Type SIMPLE_CONTENT" << endl;
break;
case ASTNode::ANALYZE_STRING:
cout << "Type ANALYZE_STRING" << endl;
break;
case ASTNode::CALL_TEMPLATE:
cout << "Type CALL_TEMPLATE" << endl;
break;
case ASTNode::APPLY_TEMPLATES:
cout << "Type APPLY_TEMPLATES" << endl;
break;
case ASTNode::INLINE_FUNCTION:
cout << "Type INLINE_FUNCTION" << endl;
break;
case ASTNode::FUNCTION_REF:
cout << "Type FUNCTION_REF" << endl;
break;
case ASTNode::FUNCTION_DEREF:
cout << "Type FUNCTION_DEREF" << endl;
break;
case ASTNode::COPY_OF:
cout << "Type COPY_OF" << endl;
break;
case ASTNode::COPY:
cout << "Type COPY" << endl;
break;
case ASTNode::MAP:
cout << "Type MAP" << endl;
break;
case ASTNode::DEBUG_HOOK:
cout << "Type DEBUG_HOOK" << endl;
break;
}
}
// xmlHashScan() callback: records one DTD element declaration.
//
// payload  the hash entry, an xmlElement declaration.
// data     the map<string, vector<string> > to fill (element name ->
//          declared attribute names).
// name     hash key of the entry; unused, elem->name is used instead.
//
// The element is registered even when it declares no attributes, so that
// attribute-less elements are still known to the DTD lookup.
// NOTE(review): newer libxml2 releases declare the scanner callback with a
// `const xmlChar *name` parameter; confirm against the libxml2 version in use.
void scanElement(void *payload, void *data, xmlChar *name)
{
    xmlElement *elem = static_cast<xmlElement*>(payload);
    map<string,vector<string> > *elements = static_cast<map<string,vector<string> > *>(data);

    if (elem == NULL || elem->name == NULL || elements == NULL)
        return;

    // operator[] creates the (possibly empty) entry for this element.
    vector<string> &attributes = (*elements)[reinterpret_cast<char const *>(elem->name)];

    // An element's attribute declarations are chained through `nexth`;
    // `next` is the sibling link inside the DTD node list and would walk on
    // into unrelated declarations and comments.
    for (xmlAttributePtr attr = elem->attributes; attr != NULL; attr = attr->nexth)
    {
        if (attr->name != NULL)
            attributes.push_back(reinterpret_cast<char const *>(attr->name));
    }
}
bool parseDtd(string const &file, map<string, vector<string> > &elements)
{
xmlParserInputBufferPtr input = xmlParserInputBufferCreateFilename(file.c_str(), XML_CHAR_ENCODING_8859_1);
xmlDtdPtr dtd = xmlIOParseDTD(NULL, input, XML_CHAR_ENCODING_8859_1);
if (dtd == NULL)
{
cerr << "Could not parse DTD" << endl;
return false;
}
if (dtd->elements == NULL)
{
cerr << "DTD hashtable has no elements" << endl;
return false;
}
xmlHashScan(reinterpret_cast<xmlHashTablePtr>(dtd->elements), scanElement, &elements);
return true;
}
int main(int argc, char** argv)
{
string queryString(argc > 1 ? argv[1] : "//node[@rel='su' and string(@begin) = 4]");
DynamicContext *ctx = s_xqilla.createContext(XQilla::XPATH2);
ctx->setXPath1CompatibilityMode(true);
XQQuery *query = s_xqilla.parse(X(queryString.c_str()), ctx);
cout << query->getQueryPlan() << endl;
ASTNode *root = query->getQueryBody();
Scope *rootScope = new Scope();
rootScope->setNodeName("[document root]");
map<string,vector<string> > elements;
parseDtd("alpino_ds.dtd", elements);
DTD alpinoDtd(elements);
inspect(root, rootScope, alpinoDtd);
cout << "Done." << endl;
delete rootScope;
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment