Skip to content

Instantly share code, notes, and snippets.

@jeffreycwitt
Created October 11, 2015 21:13
Show Gist options
  • Save jeffreycwitt/272187b3cd8b2fa7665f to your computer and use it in GitHub Desktop.
Save jeffreycwitt/272187b3cd8b2fa7665f to your computer and use it in GitHub Desktop.
existdb-xquery word frequency report with Lucene index
xquery version "3.0";

(:~
 : Word-frequency report for an eXist-db collection.
 :
 : Walks the keys of the Lucene index defined on tei:p elements of the
 : configured collection, collects one <term/> element per key, and renders
 : an HTML table listing every term with its frequency and its share of the
 : total word count.
 :
 : Requires a Lucene index configured on tei:p for the target collection;
 : without it util:index-keys yields no terms and the table is empty.
 :)
declare namespace tei="http://www.tei-c.org/ns/1.0";

(: The serialization key is "indent"; it was previously misspelled "ident",
   so the intended setting was not being applied. :)
declare option exist:serialize "method=html media-type=text/html indent=no";

(: Collection path, relative to /db/apps. :)
let $collection := "scta/lombardsententia"

let $terms :=
    <terms>
    {
        (: The callback receives the index key and a sequence of statistics.
           Item 1 is stored as the term's count and item 2 as its docs value
           (per the util:index-keys documentation: overall frequency and
           number of documents containing the term).
           The start value "" matches every key; -1 is presumably
           "no limit on the number of keys" - confirm against the eXist-db
           version in use. :)
        util:index-keys(
            collection(concat("/db/apps/", $collection))//tei:p,
            "",
            function($key, $count) {
                <term name="{$key}" count="{$count[1]}" docs="{$count[2]}"/>
            },
            -1,
            "lucene-index")
    }
    </terms>

(: Total number of indexed word occurrences across all terms. :)
let $sum := sum($terms//@count)

return
    <html>
        <head>
        </head>
        <body>
            <h1>Frequency analysis for {$collection}</h1>
            <h2>Total word count: {$sum}</h2>
            <table>
                <tr>
                    <td>Term</td>
                    <td>Frequency</td>
                    <td>Percentage</td>
                </tr>
                {
                    (: $sum cannot be zero inside this loop: if there are no
                       terms the loop body never runs, and every term
                       contributes a positive count. :)
                    for $term in $terms//term
                    let $frequency := $term/@count/string()
                    (: The percent sign in the picture makes format-number
                       multiply by 100 and append "%". The leading "0" forces
                       an integer digit so small shares read "0.50%" rather
                       than ".50%". No extra literal "%" is written in the
                       cell, which previously produced a doubled sign. :)
                    let $percentage := format-number(($term/@count div $sum), "0.00%")
                    return
                        <tr>
                            <td>{$term/@name/string()}</td>
                            <td>{$frequency}</td>
                            <td>{$percentage}</td>
                        </tr>
                }
            </table>
        </body>
    </html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment