-
-
Save wsalesky/03b8637f9016e13aa482 to your computer and use it in GitHub Desktop.
Repurposes @joewiz's highlight-match.xql to do primitive NER on Syriaca.org data, matching against Syriaca.org English place names. The XQuery highlights regex pattern matches in XML while preserving node structure.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "3.0"; | |
declare namespace tei = "http://www.tei-c.org/ns/1.0"; | |
declare namespace fn="http://www.w3.org/2005/xpath-functions"; | |
(:
 : A simple adaptation of @joewiz's highlight-match.xql [https://gist.github.com/joewiz/5937897] as a proof of concept for simple NER based on a predefined list.
 : Allows placeName recognition against a predetermined list of names; in this case it uses the place names in https://github.com/srophe/srophe-app-data
 : Names are wrapped in a placeName tag with a @ref attribute linking them to the authority record in https://github.com/srophe/srophe-app-data
 :)
(:~
 : Recursively copy $nodes, applying $highlight to every match of $pattern found in text nodes.
 :
 : Elements are rebuilt with their namespace and attributes; existing tei:placeName elements are
 : returned untouched so names are never wrapped twice; other node kinds are returned as-is.
 : Runs of whitespace in text nodes are collapsed to a single space before matching.
 :
 : @param $nodes     nodes to process
 : @param $pattern   regular expression(s); several strings are combined as alternatives, and
 :                   empty strings are ignored
 : @param $highlight function called with the text of each match; its result replaces the match
 : @return the copied nodes, with matches replaced by the result of $highlight
 :)
declare function local:highlight-matches($nodes as node()*, $pattern as xs:string*, $highlight as function(xs:string) as item()* ) {
    (: analyze-string() takes exactly one pattern and raises FORX0003 for a pattern that matches
       the empty string, so drop empty entries and join whatever is left into one alternation. :)
    let $regex := string-join($pattern[. ne ''], '|')
    for $node in $nodes
    return
        typeswitch ( $node )
            (: Does not wrap names found within existing placeNames :)
            case element(tei:placeName) return
                $node
            case element() return
                (: Use QName to preserve namespaces :)
                element { QName(namespace-uri($node), local-name($node)) } {
                    $node/@*,
                    local:highlight-matches($node/node(), $regex, $highlight)
                }
            case text() return
                let $normalized := replace($node, '\s+', ' ')
                return
                    if ($regex eq '') then
                        (: Nothing to search for: keep the (normalized) text as it is :)
                        text { $normalized }
                    else
                        for $segment in analyze-string($normalized, $regex)/node()
                        return
                            if ($segment instance of element(fn:match)) then
                                $highlight($segment/string())
                            else
                                (: Emit a text node rather than a bare string: adjacent atomic values
                                   in element content are joined with an extra space. :)
                                text { $segment/string() }
            case document-node() return
                document { local:highlight-matches($node/node(), $regex, $highlight) }
            default return
                $node
};
(:~
 : Gets the URI(s) of the place record(s) whose English name equals $string.
 :
 : Looks in collection('PATH-TO-PLACE-RECS') for tei:placeName[@xml:lang='en'] elements whose
 : whitespace-normalized value equals the whitespace-normalized $string, and reads the
 : following-sibling tei:idno[@type='URI'] values that start with 'http://syriaca.org'.
 : English names only for now.
 :
 : @param $string the name to look up
 : @return the distinct matching URIs as strings; empty if no record matches
 :)
declare function local:get-id($string as xs:string) as xs:string* {
    let $target := normalize-space($string)
    let $uris :=
        collection('PATH-TO-PLACE-RECS')//tei:placeName[@xml:lang = 'en'][normalize-space(.) = $target]
            /following-sibling::tei:idno[@type = 'URI'][starts-with(., 'http://syriaca.org')]
            /normalize-space(.)
    (: The same idno is reached once per matching name, so de-duplicate before returning :)
    for $uri in distinct-values($uris)
    return string($uri)
};
(: Test TEI record :)
let $node :=
    <TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:syriaca="http://syriaca.org" xml:lang="en">
        <body>
            <listPerson>
                <person xml:id="saint-1094" ana="#syriaca-saint">
                    <note>Fiey provides the following bibliographic citations: <quote source="#bibl1094-1">Fiey, "Bar 'Eta", 10-14. BSO, I, 4- 5 (J. Habb).</quote>
                    </note>
                    <note>Testing another name Babai of Nisibis this one with 3 words</note>
                    <note>Testing names <placeName>Ayn Zarba</placeName></note>
                    <note type="abstract">Abā was a Bishop of Nineveh who was martyred under shah Shapur II.</note>
                </person>
            </listPerson>
        </body>
    </TEI>
(: List of names to be matched against in local:highlight-matches() :)
(: Not a lot of wiggle room in names: a name must be pretty much an exact match, which can be a
   problem with different variations in transliteration, e.g. diacritics :)
(: Names are sorted by string-length, longest first, because when several alternatives match at
   the same position the regex takes the first one listed :)
let $names :=
    (: Whitespace is normalized because the text being searched is normalized too; empty names
       are dropped because an empty alternative would make the whole pattern match the empty
       string; duplicates are dropped to keep the pattern small. :)
    for $name in distinct-values(
        collection('PATH-TO-PLACE-RECS')//tei:placeName[@xml:lang = 'en']/normalize-space(.)
    )[. ne '']
    order by string-length($name) descending
    (: Names are literal text, not regular expressions: escape regex metacharacters :)
    return replace($name, '(\.|\[|\]|\\|\||\-|\^|\$|\?|\*|\+|\{|\}|\(|\))', '\\$1')
let $pattern := string-join($names, '|')
(: Wrap matches in a TEI placeName with the matching URI(s) in @ref :)
let $highlight := function($string as xs:string) {
    let $uris := local:get-id($string)
    return
        (: Build the element in the TEI namespace so it matches the document it is inserted into :)
        element { QName('http://www.tei-c.org/ns/1.0', 'placeName') } {
            (: Several URIs are space-separated; leave @ref out when nothing was found :)
            if (exists($uris)) then attribute ref { string-join($uris, ' ') } else (),
            $string
        }
}
(: Return result :)
return local:highlight-matches($node, $pattern, $highlight)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<TEI xml:lang="en"> | |
<body> | |
<listPerson> | |
<person xml:id="saint-1094" ana="#syriaca-saint"> | |
<note>Fiey provides the following bibliographic citations: | |
<quote source="#bibl1094-1">Fiey, "Bar 'Eta", 10-14. BSO, I, 4- 5 (J. Habb).</quote> | |
</note> | |
<note>Testing another name | |
<placeName ref="http://syriaca.org/place/609">Babai of Nisibis</placeName> | |
this one with 3 words</note> | |
<note>Testing names | |
<placeName xmlns="http://www.tei-c.org/ns/1.0">Ayn Zarba</placeName> | |
</note> | |
<note type="abstract">Abā was a Bishop of | |
<placeName ref="http://syriaca.org/place/2350 http://syriaca.org/place/2346 http://syriaca.org/place/144">Nineveh</placeName> | |
who was martyred under shah Shapur II.</note> | |
</person> | |
</listPerson> | |
</body> | |
</TEI> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment