Last active
October 7, 2016 15:06
-
-
Save joewiz/ad32ad9ceb1649269fc1093db159a9e2 to your computer and use it in GitHub Desktop.
Find FRUS documents whose English dates do not match the date metadata
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "3.1";

(:~
 : Find cases such as this:
 :
 :   <dateline>
 :     <placeName>Washington</placeName>,
 :     <date when="1971-10-05">October 15, 1971</date>.
 :   </dateline>
 :
 : ... where the supplied English date, October 15, 1971, does not match the
 : supplied machine-readable date, 1971-10-05.
 :
 : For each dateline that carries a tei:date with a @when attribute, the English
 : text of the first such date is parsed with dates:parseDate and compared to
 : the date portion of @when. Datelines that do not match, or whose English
 : text cannot be parsed, are written to /db/report.xml.
 :
 : Requires eXist-db (util:document-name, xmldb:store) and the date-parser
 : module imported below.
 :)

declare namespace tei="http://www.tei-c.org/ns/1.0";

import module namespace dates="http://xqdev.com/dateparser" at "/db/apps/twitter/modules/date-parser.xqm";

let $vols :=
    doc('/db/apps/frus/volumes/frus1969-76v17.xml')
    (: collection('/db/apps/frus/volumes') :)

(: Matches English dates written as "Month D, YYYY"; constant, so defined once. :)
let $english-date-regex := '[A-Z][a-z]+\s+\d{1,2},\s+\d{4}'

let $datelines := $vols//tei:dateline[.//tei:date/@when]
let $report :=
    element report {
        for $dateline in $datelines
        (: ancestor:: is a reverse axis, so [1] is the nearest enclosing div with an id :)
        let $div-id := $dateline/ancestor::tei:div[@xml:id][1]/@xml:id
        let $vol-id := util:document-name($dateline) ! substring-before(., '.xml')
        (: Parenthesised so that [1] picks the first date in the whole dateline.
           Without the parentheses, //tei:date[@when][1] returns every date that
           is first within its own parent, and normalize-space() then fails on a
           sequence of more than one item. :)
        let $supplied-english-date := ($dateline//tei:date[@when])[1]
        let $supplied-english-text := normalize-space($supplied-english-date)
        let $supplied-when-attribute := $supplied-english-date/@when
        (: Only the first "Month D, YYYY" match is used, so that text containing
           several dates still hands a single value to the parser. :)
        let $parsed-english-date :=
            (analyze-string($supplied-english-text, $english-date-regex)//fn:match)[1]
        let $parsed-iso-date :=
            try {
                let $parsed :=
                    if ($parsed-english-date) then
                        dates:parseDate($parsed-english-date)
                    else
                        (: No "Month D, YYYY" found; let the parser try the whole text. :)
                        dates:parseDate($supplied-english-text)
                return
                    (: The parser returns a node. For month-resolution input it has
                       been observed to wrap the date in a <value> child next to a
                       <range>, so prefer <value> when present and otherwise fall
                       back to the node's own string value. The result is always a
                       plain string. :)
                    normalize-space(($parsed/descendant-or-self::*:value, $parsed)[1])
            } catch * {
                "parseDate had problems with " || $supplied-english-text
            }
        (: @when may be a dateTime (1971-07-11T10:35:00) or a partial date
           (1971-11). Keep at most the date portion and compare it as a prefix
           of the parsed date, so that "November 1971" agrees with when="1971-11"
           instead of being reported as a mismatch. An empty @when never matches. :)
        let $when-date := substring($supplied-when-attribute, 1, 10)
        let $matches := $when-date ne '' and starts-with($parsed-iso-date, $when-date)
        where not($matches)
        return
            element doc {
                element vol-id { $vol-id },
                element div-id { $div-id/string() },
                element supplied-english-date { $supplied-english-text },
                element supplied-when-attribute { $supplied-when-attribute/string() },
                element parsed-english-date { $parsed-english-date/string() },
                element parsed-iso-date { $parsed-iso-date },
                element matches { $matches }
            }
    }
return
    xmldb:store('/db', 'report.xml', $report)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!-- Sample report.xml for volume frus1969-76v17: one <doc> per dateline whose
     English date text did not compare equal to the date portion of its @when
     attribute. <matches> is false for every entry listed here. -->
<report> | |
<!-- d142: the English text could not be parsed (note "July11" has no space),
     so <parsed-iso-date> holds the error message instead of a date. -->
<doc> | |
<vol-id>frus1969-76v17</vol-id> | |
<div-id>d142</div-id> | |
<supplied-english-date>July11, 1971, midnight–1:40 a.m. and 9:50–10:35 a.m.</supplied-english-date> | |
<supplied-when-attribute>1971-07-11T10:35:00</supplied-when-attribute> | |
<parsed-english-date/> | |
<parsed-iso-date>parseDate had problems with July11, 1971, midnight–1:40 a.m. and 9:50–10:35 a.m.</parsed-iso-date> | |
<matches>false</matches> | |
</doc> | |
<!-- d159: English text says the 15th, @when says the 5th. -->
<doc> | |
<vol-id>frus1969-76v17</vol-id> | |
<div-id>d159</div-id> | |
<supplied-english-date>October 15, 1971</supplied-english-date> | |
<supplied-when-attribute>1971-10-05</supplied-when-attribute> | |
<parsed-english-date>October 15, 1971</parsed-english-date> | |
<parsed-iso-date>1971-10-15</parsed-iso-date> | |
<matches>false</matches> | |
</doc> | |
<!-- d164: NOTE(review): the English text and @when both read November 1971.
     The entry appears to be flagged because the parser result here is a
     month-resolution element rather than a plain date string; confirm whether
     this should count as a mismatch. -->
<doc> | |
<vol-id>frus1969-76v17</vol-id> | |
<div-id>d164</div-id> | |
<supplied-english-date>November 1971</supplied-english-date> | |
<supplied-when-attribute>1971-11</supplied-when-attribute> | |
<parsed-english-date/> | |
<parsed-iso-date> | |
<date resolution="month"> | |
<range> | |
<start>1971-11-01</start> | |
<end>1971-11-30</end> | |
</range> | |
<value>1971-11-01</value> | |
</date> | |
</parsed-iso-date> | |
<matches>false</matches> | |
</doc> | |
</report> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment