Skip to content

Instantly share code, notes, and snippets.

@wsalesky
Forked from joewiz/highlight-matches.xq
Last active September 2, 2015 17:24
Show Gist options
  • Save wsalesky/03b8637f9016e13aa482 to your computer and use it in GitHub Desktop.
Save wsalesky/03b8637f9016e13aa482 to your computer and use it in GitHub Desktop.
Repurposes @joewiz's highlight-matches.xq to do primitive NER on Syriaca.org data by matching text against Syriaca.org English place names. The XQuery highlights regex pattern matches in XML while preserving node structure.
xquery version "3.0";
declare namespace tei = "http://www.tei-c.org/ns/1.0";
declare namespace fn="http://www.w3.org/2005/xpath-functions";
(:
: A simple adaptation of @joewiz's highlight-matches.xq [https://gist.github.com/joewiz/5937897] as a proof of concept for simple NER based on a predefined list.
: Allows placeName recognition against a predetermined list of names; in this case it uses the place names in https://github.com/srophe/srophe-app-data
: Names are wrapped in a placeName tag with a @ref attribute linking them to the authority record in https://github.com/srophe/srophe-app-data
:)
(:~
 : Searches within $nodes for matches to a regular expression and applies
 : $highlight to every matched string, rebuilding the node tree around the results.
 :
 : @param $nodes     nodes to process (documents, elements, text, or anything else)
 : @param $pattern   one or more regular expressions; several entries are treated as
 :                   alternatives, and empty entries are ignored
 : @param $highlight function called with each matched string; whatever it returns
 :                   replaces the match in the output
 : @return a copy of $nodes in which text has its whitespace runs collapsed to single
 :         spaces and every match is replaced by the result of $highlight
 :)
declare function local:highlight-matches($nodes as node()*, $pattern as xs:string*, $highlight as function(xs:string) as item()* ) {
    (: The signature admits zero or many patterns, but analyze-string() needs exactly
       one regex that cannot match the empty string. Drop empty entries and combine
       the remainder as alternatives ('|' has the lowest precedence in a regex). :)
    let $regex := string-join($pattern[. ne ''], '|')
    for $node in $nodes
    return
        typeswitch ( $node )
            (: Does not wrap names found within existing placeNames :)
            case element(tei:placeName) return
                $node
            case element() return
                (: Use QName to preserve namespaces :)
                element { QName(namespace-uri($node), local-name($node)) } {
                    $node/@*,
                    local:highlight-matches($node/node(), $regex, $highlight)
                }
            case text() return
                (: Collapse whitespace runs so multi-word patterns also match across line breaks :)
                let $normalized := replace($node, '\s+', ' ')
                return
                    if ($regex eq '') then
                        (: Nothing to search for: keep the (normalized) text as is :)
                        text { $normalized }
                    else
                        for $segment in analyze-string($normalized, $regex)/node()
                        return
                            if ($segment instance of element(fn:match)) then
                                $highlight($segment/string())
                            else
                                (: Emit a text node rather than an atomic string: adjacent atomic
                                   values in element content are joined with a space, which would
                                   inject stray spaces whenever $highlight returns a string. :)
                                text { $segment/string() }
            case document-node() return
                document { local:highlight-matches($node/node(), $regex, $highlight) }
            default return
                $node
};
(:~
 : Gets the URI(s) of the place record(s) whose name equals $string.
 : Only placeName elements marked xml:lang="en" are considered for now.
 :
 : @param $string the name to look up; it must equal the placeName's string value
 : @return the text of every following-sibling idno of type "URI" that starts with
 :         "http://syriaca.org"; empty when no record matches
 :)
declare function local:get-id($string as xs:string) as xs:string* {
    let $english-names := collection('PATH-TO-PLACE-RECS')//tei:placeName[@xml:lang = 'en']
    let $same-name := $english-names[. = $string]
    let $uris :=
        $same-name/following-sibling::tei:idno[@type = 'URI'][starts-with(., 'http://syriaca.org')]
    return
        $uris/text()
};
(: Test TEI record :)
let $node :=
<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:syriaca="http://syriaca.org" xml:lang="en">
<body>
<listPerson>
<person xml:id="saint-1094" ana="#syriaca-saint">
<note>Fiey provides the following bibliographic citations: <quote source="#bibl1094-1">Fiey, "Bar 'Eta", 10-14. BSO, I, 4- 5 (J. Habb).</quote>
</note>
<note>Testing another name Babai of Nisibis this one with 3 words</note>
<note>Testing names <placeName>Ayn Zarba</placeName></note>
<note type="abstract">Abā was a Bishop of Nineveh who was martyred under shah Shapur II.</note>
</person>
</listPerson>
</body>
</TEI>
(: List of names to be matched against in local:highlight-matches() :)
(: Not a lot of wiggle room in names: a name must match exactly, which can be a problem
   with different variations in transliteration (e.g. diacritics). :)
(: Names are sorted by string-length, longest first, because the regex takes the first
   alternative that matches; this lets a longer name win over a shorter name it contains. :)
let $pattern :=
    string-join(
        (: Use each name's full string value, which is what local:get-id() compares against,
           and list every distinct name only once. :)
        for $name in distinct-values(collection('PATH-TO-PLACE-RECS')//tei:placeName[@xml:lang = 'en']/string())
        (: An empty or whitespace-only name would become an alternative that matches
           nothing useful (or raises an error in analyze-string), so skip it. :)
        where normalize-space($name) ne ''
        order by string-length($name) descending
        (: Names are literal text, not regexes: escape metacharacters such as . ( ) ? + :)
        return replace($name, '([\\\|\.\?\*\+\(\)\{\}\[\]\^\$\-])', '\\$1'),
        '|')
(: Wrap matches in a TEI placeName with the matching URI(s) in @ref.
   The element is built in the TEI namespace so it matches the elements around it;
   several URIs for the same name end up space-separated in @ref. :)
let $highlight := function($string as xs:string) {
    element { QName('http://www.tei-c.org/ns/1.0', 'placeName') } {
        attribute ref { local:get-id($string) },
        $string
    }
}
(: Return result :)
return local:highlight-matches($node, $pattern, $highlight)
<TEI xml:lang="en">
<body>
<listPerson>
<person xml:id="saint-1094" ana="#syriaca-saint">
<note>Fiey provides the following bibliographic citations:
<quote source="#bibl1094-1">Fiey, "Bar 'Eta", 10-14. BSO, I, 4- 5 (J. Habb).</quote>
</note>
<note>Testing another name
<placeName ref="http://syriaca.org/place/609">Babai of Nisibis</placeName>
this one with 3 words</note>
<note>Testing names
<placeName xmlns="http://www.tei-c.org/ns/1.0">Ayn Zarba</placeName>
</note>
<note type="abstract">Abā was a Bishop of
<placeName ref="http://syriaca.org/place/2350 http://syriaca.org/place/2346 http://syriaca.org/place/144">Nineveh</placeName>
who was martyred under shah Shapur II.</note>
</person>
</listPerson>
</body>
</TEI>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment