Skip to content

Instantly share code, notes, and snippets.

@grtjn
Created June 27, 2012 10:17
Show Gist options
  • Select an option

  • Save grtjn/3003155 to your computer and use it in GitHub Desktop.

Select an option

Save grtjn/3003155 to your computer and use it in GitHub Desktop.
MarkLogic text collector plugin
xquery version "1.0-ml";
(: Copyright 2012 Grtjn. All Rights Reserved. :)
declare namespace textscan = "http://grtjn.nl/marklogic/plugin/textscan";
import module namespace plugin = "http://marklogic.com/extension/plugin" at "/MarkLogic/plugin/plugin.xqy";
import module namespace info="http://marklogic.com/appservices/infostudio" at "/MarkLogic/appservices/infostudio/info.xqy";
import module namespace infodev="http://marklogic.com/appservices/infostudio/dev" at "/MarkLogic/appservices/infostudio/infodev.xqy";
declare namespace ml="http://marklogic.com/appservices/mlogic";
declare namespace lbl="http://marklogic.com/xqutils/labels";
(:~
 : Builds the map of capabilities implemented by this plugin.
 :
 : Each key is a capability URI and each value is an xdmp:function reference
 : to the textscan function that implements it.
 :
 : Required capabilities for all Collectors
 : - http://marklogic.com/appservices/infostudio/collector/model
 : - http://marklogic.com/appservices/infostudio/collector/start
 : - http://marklogic.com/appservices/string
 :
 : @return map:map keyed by capability URI
 :)
declare function textscan:capabilities()
as map:map
{
  let $capabilities := map:map()
  return (
    (: every map:put contributes nothing to the sequence; the map is the only item returned :)
    map:put($capabilities, "http://marklogic.com/appservices/infostudio/collector/model",
      xdmp:function(xs:QName("textscan:model"))),
    map:put($capabilities, "http://marklogic.com/appservices/infostudio/collector/start",
      xdmp:function(xs:QName("textscan:start"))),
    map:put($capabilities, "http://marklogic.com/appservices/infostudio/collector/config-view",
      xdmp:function(xs:QName("textscan:config-view"))),
    map:put($capabilities, "http://marklogic.com/appservices/infostudio/collector/cancel",
      xdmp:function(xs:QName("textscan:cancel"))),
    map:put($capabilities, "http://marklogic.com/appservices/infostudio/collector/validate",
      xdmp:function(xs:QName("textscan:validate"))),
    map:put($capabilities, "http://marklogic.com/appservices/string",
      xdmp:function(xs:QName("textscan:string"))),
    $capabilities
  )
};
(:~
 : Default data model behind the configuration UI; this is the data handed to
 : textscan:start when the collector is invoked.
 :
 : Fields under plugin:data:
 : - dir      : empty by default, carries a "hint" attribute with placeholder text
 : - tokenize : defaults to the pattern [\n\r]+
 : - wrap     : defaults to "text"
 : - big      : marked as="xs:boolean", defaults to "true"
 : - max      : defaults to 100000
 :
 : @return a fresh plugin:plugin-model element holding the defaults
 :)
declare function textscan:model()
as element(plugin:plugin-model)
{
  <plugin:plugin-model>
    <plugin:data>
      <dir hint="Enter directory here"/>
      <tokenize>[\n\r]+</tokenize>
      <wrap>text</wrap>
      <big as="xs:boolean">true</big>
      <max>100000</max>
    </plugin:data>
  </plugin:plugin-model>
};
(:~
 : Invokes the plugin: walks the directory configured in the model and hands
 : each entry to textscan:process-file.
 :
 : @param $model          plugin model; the directory is read from plugin:data/*:dir
 : @param $ticket-id      id of the ticket this run reports to
 : @param $policy-deltas  optional policy overrides, passed through unchanged
 : @return empty sequence
 :)
declare function textscan:start(
  $model as element(),
  $ticket-id as xs:string,
  $policy-deltas as element(info:options)?
)
as empty-sequence()
{
  infodev:filesystem-walk(
    string($model/plugin:data/*:dir),
    $ticket-id,
    xdmp:function(xs:QName("textscan:process-file")),
    $policy-deltas,
    (: the whole model travels along as the context argument of the callback :)
    $model
  )
};
(:~
 : Per-file callback that textscan:start passes to infodev:filesystem-walk.
 :
 : Reads the file at $source-location and either
 : - splits its text with the configured tokenize pattern and ingests every
 :   token as its own document, grouped into transactions, or
 : - ingests the whole file as one document when no tokenize pattern is set.
 : When a wrap element name is configured the text is wrapped in that element
 : and the document name ends in ".xml"; otherwise it ends in ".txt".
 :
 : @param $document         incoming document; never read, it is rebound below
 : @param $source-location  path of the file to process
 : @param $ticket-id        id of the ticket this run reports to
 : @param $policy-deltas    optional policy overrides, passed through unchanged
 : @param $context          the plugin model (tokenize, wrap, big, max are read from plugin:data)
 : @return whatever the infodev ingest / error-handling calls yield
 :         (NOTE(review): declared as xs:string*; confirm against the infodev API)
 :)
declare function textscan:process-file(
$document as node()?,
$source-location as xs:string,
$ticket-id as xs:string,
$policy-deltas as element(info:options)?,
$context as item()?
)
as xs:string*
{
let $tokenize := string($context/plugin:data/tokenize)
let $wrap := string($context/plugin:data/wrap)
let $big := (string($context/plugin:data/big) eq "true")
(: $max stays the empty sequence when the field is blank, which means "no limit" below :)
let $max := $context/plugin:data/max[. != '']/xs:integer(.)
(: shadows the $document parameter: the content is always (re)read from $source-location :)
let $document :=
if ($big) then
infodev:get-text($source-location,$ticket-id,$policy-deltas)
else
infodev:get-file($source-location,$ticket-id,$policy-deltas)
let $result :=
try {
if ($tokenize != "") then
(: split mode: one document per token, capped at $max tokens when $max is set :)
let $lines := if (fn:exists($max)) then fn:tokenize($document, $tokenize)[1 to $max] else fn:tokenize($document, $tokenize)
let $nr-lines := count($lines)
(: get transaction-size from policy :)
let $ticket := info:ticket($ticket-id)
let $name := fn:data($ticket/info:policy-name)
(: NOTE(review): if the policy carries no max-docs-per-transaction, $transaction-size is empty,
   $nr-transactions becomes empty and the loop below runs zero times; confirm the policy always sets it :)
let $transaction-size := fn:data(infodev:effective-policy($name,())/info:max-docs-per-transaction)
let $nr-transactions := ceiling($nr-lines div $transaction-size)
(: set total documents and total transactions so UI displays collecting :)
let $current-total :=
($ticket/info:total-documents/xs:integer(.), 0)[1]
let $current-trans :=
($ticket/info:total-transactions/xs:integer(.), 0)[1]
(: Increment upon current counts, but subtract 1 on doc total for the file being split; it doesn't get loaded. :)
let $set-total := infodev:ticket-set-total-documents($ticket-id, $current-total + $nr-lines - 1)
let $set-trans := infodev:ticket-set-total-transactions($ticket-id, $current-trans + $nr-transactions)
(: create transactions by breaking document set into maps
each map of documents is saved to the db in their own transaction :)
for $transaction-nr in (1 to $nr-transactions)
let $transaction := map:map()
(: 1-based, inclusive token range covered by this transaction :)
let $start := (($transaction-nr - 1) * $transaction-size) + 1
let $finish := min((($start - 1 + $transaction-size), $nr-lines))
let $put :=
for $line at $line-nr in $lines[$start to $finish]
(: document id is "<source-location>-<absolute token number>" plus .xml or .txt :)
let $id := fn:concat($source-location, "-", ($start + $line-nr - 1), if ($wrap != "") then ".xml" else ".txt")
return map:put($transaction,$id,if ($wrap != "") then element {$wrap} {$line} else text { $line })
(: the callback function for ingest :)
let $function := xdmp:function(xs:QName("infodev:ingest"))
return
(: an error in one transaction is reported on the ticket; the loop continues with the next one :)
try {
infodev:transaction($transaction,$ticket-id,$function,$policy-deltas,$transaction-nr,(),())
} catch($e) {
infodev:handle-error($ticket-id, concat("transaction ",$transaction-nr), $e)
}
else
(: whole-file mode: the original file extension is replaced by .xml or .txt :)
let $text-name := fn:concat(fn:replace($source-location, "\.[^.]+$", ""), if ($wrap != "") then ".xml" else ".txt")
return
infodev:ingest(if ($wrap != "") then element {$wrap} {$document} else $document,$text-name,$ticket-id,$policy-deltas)
} catch($e) {
(: any other failure is reported on the ticket and also written to the server log :)
(infodev:handle-error($ticket-id, $source-location, $e), xdmp:log(fn:concat("ERROR",$e)))
}
return $result
};
(:~
 : A stand-alone page to configure the collector.
 :
 : Renders one form control per child of plugin:data in the model (via
 : textscan:insert-field) inside a form that posts to $submit-here.
 : When no model is supplied the defaults from textscan:model are used.
 :
 : The previous hand-written markup, kept as an XML comment inside the form,
 : was emitted into every generated page and referenced variables that no
 : longer exist; it has been removed in favour of the generated fields.
 :
 : @param $model        current plugin model, or empty to use the defaults
 : @param $lang         language code used to look up labels
 : @param $submit-here  URL the form posts to
 : @return the plugin:config-view element wrapping the XHTML page
 :)
declare function textscan:config-view(
  $model as element(plugin:plugin-model)?,
  $lang as xs:string,
  $submit-here as xs:string
)
as element(plugin:config-view)
{
  let $model :=
    if (fn:exists($model)) then
      $model
    else
      textscan:model()
  return
    <config-view xmlns="http://marklogic.com/extension/plugin">
      <html xmlns="http://www.w3.org/1999/xhtml">
        <head>
          <title>iframe plugin configuration</title>
        </head>
        <body>
          <h2>{textscan:string("config-title", $model, $lang)}</h2>
          <form style="margin-top: 20px;" action="{$submit-here}" method="post">
            {
              (: one label / control / description group per model field :)
              for $field in $model/plugin:data/*
              return
                textscan:insert-field($field, $model, $lang)
            }
            <div style="position: absolute; bottom: 2px; right: 0px;">
              <ml:submit label="Done"/>
            </div>
          </form>
        </body>
      </html>
    </config-view>
};
(:~
 : Renders the form controls for a single model field: a label, an input
 : control, a description paragraph and a line break.
 :
 : Fields marked as="xs:boolean" become a true/false select with the current
 : value pre-selected; all other fields become a text input whose value is the
 : field value or, when that is blank, the field's hint attribute.
 :
 : The label's "for" attribute now points at the id of the control generated
 : for this field; it was previously hard-coded to "dir" for every field.
 :
 : @param $field  one child element of plugin:data
 : @param $model  the plugin model, passed on to the label lookup
 : @param $lang   language code used to look up labels
 : @return sequence of label, control, description paragraph and br elements
 :)
declare function textscan:insert-field(
  $field as element(),
  $model as element(plugin:plugin-model),
  $lang as xs:string
)
as item()*
{
  let $field-name := local-name($field)
  let $field-value := string($field)
  let $field-hint := string($field/@hint)
  return (
    <label for="{$field-name}">{
      textscan:string(fn:concat($field-name, "-label"), $model, $lang)
    }: </label>,
    if ($field/@as = 'xs:boolean') then
      <select name="{$field-name}" id="{$field-name}">{
        for $d in ('true', 'false')
        return
          <option value="{$d}">{
            if ($d eq $field-value) then
              attribute selected { "selected" }
            else (),
            $d
          }</option>
      }</select>
    else
      (: a blank field shows its hint as the initial value; textscan:validate rejects a dir equal to its hint :)
      <input type="text" name="{$field-name}" id="{$field-name}" style="width: 400px" value="{($field-value[. != ''], $field-hint)[1]}"/>
    ,
    <p style="color: rgb(125,125,125); font-style: italic;">{
      textscan:string(fn:concat($field-name, "-description"), $model, $lang)
    }</p>,
    <br/>
  )
};
(:~
 : Cancels a running collection by setting the ticket status to "cancelled".
 :
 : @param $ticket-id  id of the ticket to cancel
 : @return empty sequence
 :)
declare function textscan:cancel(
  $ticket-id as xs:string
)
as empty-sequence()
{
  let $status := "cancelled"
  return infodev:ticket-set-status($ticket-id, $status)
};
(:~
 : Validates a given model; returns () if good, specific errors (with IDs) if
 : there are problems.
 :
 : The only check is on the dir field: it must be non-empty and must differ
 : from its own hint attribute (the placeholder text). The error message is
 : looked up with the fixed language 'en'.
 :
 : @param $model  the plugin model to validate
 : @return a plugin:report with id "dir" when the directory is missing, else ()
 :)
declare function textscan:validate(
  $model as element(plugin:plugin-model)
)
as element(plugin:report)*
{
  let $dir := $model/plugin:data/dir
  return
    if ((string-length($dir) eq 0) or ($dir eq $dir/@hint)) then
      <plugin:report id="dir">{textscan:string('empty-dir-error', $model, 'en')}</plugin:report>
    else
      ()
};
(:~
 : All labels needed for display are collected here.
 :
 : Looks up the label with the given key in the given language. The
 : "description" label is dynamic: with a model it lists every model field as
 : "label: value" separated by escaped br tags; without a model it is a fixed
 : sentence.
 :
 : The dynamic description is built only when $key is "description". Earlier
 : it was built on every lookup, which triggered one recursive label lookup
 : (and label-table construction) per model field for every key.
 :
 : @param $key    label key, e.g. "name", "dir-label", "empty-dir-error"
 : @param $model  optional plugin model, used only for the "description" label
 : @param $lang   language code matched against xml:lang (only "en" is defined)
 : @return the label text, or the empty sequence when key or language is unknown
 :)
declare function textscan:string(
  $key as xs:string,
  $model as element(plugin:plugin-model)?,
  $lang as xs:string
)
as xs:string?
{
  (: skip the per-field label lookups unless this text is the one being asked for :)
  let $description :=
    if ($key ne "description") then
      ()
    else if ($model) then
      string-join((
        "Load the contents of text files from this directory on the server: ", "&lt;br/&gt;",
        for $field in $model/plugin:data/*
        let $field-name := fn:local-name($field)
        let $field-value := fn:string($field)
        (: the model is deliberately not passed on, so the nested lookup stays a plain key lookup :)
        let $label := textscan:string(fn:concat($field-name, '-label'), (), $lang)
        return
          fn:concat($label, ': ', $field-value, "&lt;br/&gt;")
      ), '')
    else
      "Load the contents of text files from a directory on the server"
  let $labels :=
    <lbl:labels xmlns:lbl="http://marklogic.com/xqutils/labels">
      <!-- labels used by Info Studio framework! -->
      <lbl:label key="name">
        <lbl:value xml:lang="en">Filesystem Text Directory</lbl:value>
      </lbl:label>
      <lbl:label key="description">
        <lbl:value xml:lang="en">{$description}</lbl:value>
      </lbl:label>
      <lbl:label key="start-label">
        <lbl:value xml:lang="en">Run</lbl:value>
      </lbl:label>
      <!-- labels for validate-errors -->
      <lbl:label key="empty-dir-error">
        <lbl:value xml:lang="en">Specified directory must not be empty</lbl:value>
      </lbl:label>
      <!-- labels for config-view -->
      <lbl:label key="config-title">
        <lbl:value xml:lang="en">Text Collector Configuration</lbl:value>
      </lbl:label>
      <lbl:label key="dir-label">
        <lbl:value xml:lang="en">Directory path</lbl:value>
      </lbl:label>
      <lbl:label key="dir-description">
        <lbl:value xml:lang="en">The full path on the remote host. This directory and its contents must be readable by MarkLogic.</lbl:value>
      </lbl:label>
      <lbl:label key="tokenize-label">
        <lbl:value xml:lang="en">Tokenize pattern</lbl:value>
      </lbl:label>
      <lbl:label key="tokenize-description">
        <lbl:value xml:lang="en">Enter a tokenize pattern if the input files need to be split.</lbl:value>
      </lbl:label>
      <lbl:label key="wrap-label">
        <lbl:value xml:lang="en">Wrap element</lbl:value>
      </lbl:label>
      <lbl:label key="wrap-description">
        <lbl:value xml:lang="en">Enter an element name to wrap text in that element. This is necessary for transformation with XSLT.</lbl:value>
      </lbl:label>
      <lbl:label key="big-label">
        <lbl:value xml:lang="en">Read text RAW</lbl:value>
      </lbl:label>
      <lbl:label key="big-description">
        <lbl:value xml:lang="en">Choose True to read text using the raw text reader function. Options like encoding will not work.</lbl:value>
      </lbl:label>
      <lbl:label key="max-label">
        <lbl:value xml:lang="en">Max tokens</lbl:value>
      </lbl:label>
      <lbl:label key="max-description">
        <lbl:value xml:lang="en">Enter the max number of lines or tokens that need to be processed.</lbl:value>
      </lbl:label>
    </lbl:labels>
  return
    $labels/lbl:label[@key eq $key]/lbl:value[@xml:lang eq $lang]/string()
};
(:~ ----------------Main, for registration----------------
 : Main expression of the module: registers this collector by passing the
 : capability map from textscan:capabilities() to plugin:register.
 : NOTE(review): "collector-text.xqy" is presumably this module's file name
 : as the plugin framework expects it; confirm against the plugin API.
 :)
plugin:register(textscan:capabilities(),"collector-text.xqy")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment