Forked from joewiz/get-tei-articles-collection-summary.xq
Created
August 7, 2013 10:23
-
-
Save emchateau/6172872 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "3.0";

(: Find the shortest and longest article and get the average word count of a
   collection of TEI XML articles.

   Output: a <results> element holding the article count, the shortest and
   longest article (as "url: N words"), and the rounded average length.
   When no articles match, the count is 0 and the other three elements are
   left empty. :)

declare namespace tei="http://www.tei-c.org/ns/1.0";

(: In our case, 'articles' are TEI divs that have @xml:id attributes and no
   nested divs; we filter out the foreword since it is not a full article. :)
let $milestone-articles :=
    collection('/db/cms/apps/tei-content/data/milestones')//tei:div[@xml:id and not(.//tei:div)][@xml:id ne 'foreword']

(: One <article> summary per div, sorted ascending by word count, so the first
   item is the shortest article and the last item is the longest. :)
let $article-infos :=
    for $article in $milestone-articles
    (: Join text nodes with a space so that words sitting in adjacent elements
       do not run together. :)
    let $text := string-join($article//text(), ' ')
    (: normalize-space() trims both ends and collapses whitespace runs to a
       single space. Tokenizing the raw text on '\s+' would yield a zero-length
       token for leading and for trailing whitespace, inflating every count. :)
    let $words := tokenize(normalize-space($text), ' ')
    let $word-count := count($words)
    order by $word-count
    return
        <article>
            <url>{concat(substring-after(substring-before(base-uri($article), '.xml'), 'milestones/'), '/', $article/@xml:id)}</url>
            <word-count>{$word-count}</word-count>
        </article>

let $article-count := count($milestone-articles)
let $shortest := $article-infos[1]
let $longest := $article-infos[last()]
return
    element results {
        element article-count { $article-count },
        element shortest-article {
            if (exists($shortest))
            then concat($shortest/url, ': ', $shortest/word-count, ' words')
            else ()
        },
        element longest-article {
            if (exists($longest))
            then concat($longest/url, ': ', $longest/word-count, ' words')
            else ()
        },
        element average-length {
            (: Guard the division: with no articles this would be 0 div 0,
               which raises a division-by-zero error. :)
            if ($article-count gt 0)
            then concat(round(sum($article-infos/word-count) div $article-count), ' words')
            else ()
        }
    }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<results> | |
<article-count>162</article-count> | |
<shortest-article>1899-1913/DollarDiplo: 172 words</shortest-article> | |
<longest-article>1945-1952/KoreanWar2: 1662 words</longest-article> | |
<average-length>739 words</average-length> | |
</results> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment