-
-
Save WaxCylinderRevival/36e7ffa53317a0f5255596aaee18a8e5 to your computer and use it in GitHub Desktop.
Find FRUS documents whose English dates do not match the date metadata
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "3.1";

(:~
 : Find datelines whose human-readable (English) date disagrees with the
 : machine-readable date in the @when attribute, such as this:
 :
 :   <dateline>
 :       <placeName>Washington</placeName>,
 :       <date when="1971-10-05">October 15, 1971</date>.
 :   </dateline>
 :
 : ... where the supplied English date, October 15, 1971, does not match the
 : supplied machine-readable date, 1971-10-05.
 :
 : Returns one <doc> element per suspect dateline, holding the volume id, the
 : id of the nearest enclosing div, the supplied English date and @when value,
 : the portion of the English date picked out by the regex, the ISO date the
 : date parser produced (or a message if parsing failed), and the match flag.
 :)

declare namespace tei="http://www.tei-c.org/ns/1.0";

import module namespace dates="http://xqdev.com/dateparser" at "/db/apps/twitter/modules/date-parser.xqm";

let $vols :=
    doc('/db/apps/frus/volumes/frus1969-76v17.xml')
    (: collection('/db/apps/frus/volumes') :)
let $datelines := $vols//tei:dateline[.//tei:date/@when]
for $dateline in $datelines
let $div-id := $dateline/ancestor::tei:div[@xml:id][1]/@xml:id
let $vol-id := util:document-name($dateline) ! substring-before(., '.xml')
(: Parenthesize before applying [1]: without the parentheses the predicate is
   evaluated per parent element, so a dateline with dated elements under
   different parents would yield several items and break the calls below. :)
let $supplied-english-date := ($dateline//tei:date[@when])[1]
let $supplied-when-attribute := $supplied-english-date/@when
let $english-date-regex := '[A-Z][a-z]+\s+\d{1,2},\s+\d{4}'
(: Only the first "Month D, YYYY" occurrence is used, so text that contains
   more than one such date still hands a single item to the parser. :)
let $parsed-english-date := (analyze-string($supplied-english-date, $english-date-regex)/fn:match)[1]
let $parsed-iso-date :=
    try {
        (: To improve likelihood of success, we parse the pre-processed date
           when the regex found one; otherwise we let the dates module try
           on the raw date. :)
        let $parser-input := ($parsed-english-date, $supplied-english-date)[1]
        let $parser-result := dates:parseDate($parser-input)
        return
            (: For reduced-precision input the parser has been observed to
               return a date element wrapping range and value children; in
               that case the value child carries the ISO date. Otherwise the
               result's own string value is used. Either way the outcome is a
               plain string, never a nested element. :)
            ($parser-result/value, $parser-result)[1] ! normalize-space(.)
    } catch * {
        "parseDate had problems with " || $supplied-english-date
    }
(: Drop any time portion of @when, keeping at most YYYY-MM-DD. :)
let $when-date := substring($supplied-when-attribute, 1, 10)
(: Compare at the precision of @when: when="1971-11" agrees with a parsed
   1971-11-01, while when="1971-10-05" still disagrees with 1971-10-15.
   An empty @when and a parser failure message never count as a match. :)
let $matches := $when-date ne '' and starts-with($parsed-iso-date, $when-date)
(: limit results to cases where the date portion of the supplied when attribute doesn't match the iso-date :)
where not($matches)
return
    element doc {
        element vol-id { $vol-id },
        element div-id { $div-id/string() },
        element supplied-english-date { $supplied-english-date/string() },
        element supplied-when-attribute { $supplied-when-attribute/string() },
        element parsed-english-date { $parsed-english-date/string() },
        element parsed-iso-date { $parsed-iso-date },
        element matches { $matches }
    }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<doc> | |
<vol-id>frus1969-76v17</vol-id> | |
<div-id>d142</div-id> | |
<supplied-english-date>July11, 1971, midnight–1:40 a.m. and 9:50–10:35 a.m.</supplied-english-date> | |
<supplied-when-attribute>1971-07-11T10:35:00</supplied-when-attribute> | |
<parsed-english-date/> | |
<parsed-iso-date>parseDate had problems with July11, 1971, midnight–1:40 a.m. and 9:50–10:35 a.m.</parsed-iso-date> | |
<matches>false</matches> | |
</doc> | |
<doc> | |
<vol-id>frus1969-76v17</vol-id> | |
<div-id>d159</div-id> | |
<supplied-english-date>October 15, 1971</supplied-english-date> | |
<supplied-when-attribute>1971-10-05</supplied-when-attribute> | |
<parsed-english-date>October 15, 1971</parsed-english-date> | |
<parsed-iso-date>1971-10-15</parsed-iso-date> | |
<matches>false</matches> | |
</doc> | |
<doc> | |
<vol-id>frus1969-76v17</vol-id> | |
<div-id>d164</div-id> | |
<supplied-english-date>November 1971</supplied-english-date> | |
<supplied-when-attribute>1971-11</supplied-when-attribute> | |
<parsed-english-date/> | |
<parsed-iso-date> | |
<date resolution="month"> | |
<range> | |
<start>1971-11-01</start> | |
<end>1971-11-30</end> | |
</range> | |
<value>1971-11-01</value> | |
</date> | |
</parsed-iso-date> | |
<matches>false</matches> | |
</doc> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment