Skip to content

Instantly share code, notes, and snippets.

@joewiz
Last active December 20, 2015 17:29
Show Gist options
  • Save joewiz/6168687 to your computer and use it in GitHub Desktop.
Save joewiz/6168687 to your computer and use it in GitHub Desktop.
Find the shortest and longest article in a collection of TEI XML articles by word count, and calculate the average word count, using XQuery
xquery version "3.0";

(:~
 : Find the shortest and longest article, and the average word count, in a
 : collection of TEI XML articles.
 :
 : Returns a <results> element containing:
 :   - article-count:    number of articles examined
 :   - shortest-article: "url: N words" for the article with the fewest words
 :   - longest-article:  "url: N words" for the article with the most words
 :   - average-length:   rounded mean word count, as "N words"
 :
 : When no articles match, only <article-count>0</article-count> is returned.
 :)
declare namespace tei="http://www.tei-c.org/ns/1.0";

(: in our case, 'articles' are TEI divs that have @xml:id attributes and no child divs;
   we filter out the foreword since it is not a full article. :)
let $milestone-articles := collection('/db/cms/apps/tei-content/data/milestones')//tei:div[@xml:id and not(.//tei:div)][@xml:id ne 'foreword']
let $article-count := count($milestone-articles)
let $article-infos :=
    for $article in $milestone-articles
    (: join the text nodes with a space so words in adjacent elements do not run together :)
    let $text := string-join($article//text(), ' ')
    (: normalize-space() trims leading/trailing whitespace and collapses internal runs to a
       single space; tokenizing the raw text on '\s+' would otherwise produce an empty-string
       token at either end and inflate the word count :)
    let $words := tokenize(normalize-space($text), ' ')
    let $word-count := count($words)
    (: 'stable' keeps document order among articles with equal word counts, so the reported
       shortest/longest article is deterministic :)
    stable order by $word-count
    return
        <article>
            <url>{concat(substring-after(substring-before(base-uri($article), '.xml'), 'milestones/'), '/', $article/@xml:id)}</url>
            <word-count>{$word-count}</word-count>
        </article>
return
    element results {
        element article-count { $article-count },
        (: with no articles there is no shortest/longest entry, and the average would be
           0 div 0, which raises a division-by-zero error; emit only the count instead :)
        if ($article-count eq 0) then
            ()
        else (
            element shortest-article { concat($article-infos[1]/url, ': ', $article-infos[1]/word-count, ' words') },
            element longest-article { concat($article-infos[last()]/url, ': ', $article-infos[last()]/word-count, ' words') },
            element average-length { concat(round(sum($article-infos/word-count) div $article-count), ' words') }
        )
    }
<results>
<article-count>162</article-count>
<shortest-article>1899-1913/DollarDiplo: 172 words</shortest-article>
<longest-article>1945-1952/KoreanWar2: 1662 words</longest-article>
<average-length>739 words</average-length>
</results>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment