Created
June 27, 2012 10:17
-
-
Save grtjn/3003155 to your computer and use it in GitHub Desktop.
MarkLogic text collector plugin
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| xquery version "1.0-ml"; | |
| (: Copyright 2012 Grtjn. All Rights Reserved. :) | |
| declare namespace textscan = "http://grtjn.nl/marklogic/plugin/textscan"; | |
| import module namespace plugin = "http://marklogic.com/extension/plugin" at "/MarkLogic/plugin/plugin.xqy"; | |
| import module namespace info="http://marklogic.com/appservices/infostudio" at "/MarkLogic/appservices/infostudio/info.xqy"; | |
| import module namespace infodev="http://marklogic.com/appservices/infostudio/dev" at "/MarkLogic/appservices/infostudio/infodev.xqy"; | |
| declare namespace ml="http://marklogic.com/appservices/mlogic"; | |
| declare namespace lbl="http://marklogic.com/xqutils/labels"; | |
| (:~ | |
|  : Builds the capability map that the main expression of this module passes to plugin:register. | |
|  : | |
|  : Each key is a capability URI, each value a reference to the textscan | |
|  : function that implements it. Capabilities required for all Collectors: | |
|  :   - http://marklogic.com/appservices/infostudio/collector/model | |
|  :   - http://marklogic.com/appservices/infostudio/collector/start | |
|  :   - http://marklogic.com/appservices/string | |
|  : | |
|  : @return map:map from capability URI to xdmp:function | |
|  :) | |
| declare function textscan:capabilities( | |
| ) | |
| as map:map | |
| { | |
|   let $registry := map:map() | |
|   (: map:put returns nothing; the binding only exists to run the registrations :) | |
|   let $registered := ( | |
|     map:put($registry, "http://marklogic.com/appservices/infostudio/collector/model", xdmp:function(xs:QName("textscan:model"))), | |
|     map:put($registry, "http://marklogic.com/appservices/infostudio/collector/start", xdmp:function(xs:QName("textscan:start"))), | |
|     map:put($registry, "http://marklogic.com/appservices/infostudio/collector/config-view", xdmp:function(xs:QName("textscan:config-view"))), | |
|     map:put($registry, "http://marklogic.com/appservices/infostudio/collector/cancel", xdmp:function(xs:QName("textscan:cancel"))), | |
|     map:put($registry, "http://marklogic.com/appservices/infostudio/collector/validate", xdmp:function(xs:QName("textscan:validate"))), | |
|     map:put($registry, "http://marklogic.com/appservices/string", xdmp:function(xs:QName("textscan:string"))) | |
|   ) | |
|   return $registry | |
| }; | |
| (:~ | |
|  : Default data model behind the configuration UI; this is the data handed to textscan:start. | |
|  : | |
|  : Fields under plugin:data: | |
|  :   dir      - directory to load, empty by default, with a hint attribute | |
|  :   tokenize - pattern used to split a file, default [\n\r]+ | |
|  :   wrap     - name of the wrapper element, default text | |
|  :   big      - flag marked as="xs:boolean", default true | |
|  :   max      - maximum number of tokens, default 100000 | |
|  : | |
|  : @return the plugin:plugin-model element holding the defaults | |
|  :) | |
| declare function textscan:model( | |
| ) | |
| as element(plugin:plugin-model) | |
| { | |
|   element plugin:plugin-model { | |
|     element plugin:data { | |
|       element dir { attribute { "hint" } { "Enter directory here" } }, | |
|       element tokenize { "[\n\r]+" }, | |
|       element wrap { "text" }, | |
|       element big { attribute { "as" } { "xs:boolean" }, "true" }, | |
|       element max { "100000" } | |
|     } | |
|   } | |
| }; | |
| (:~ | |
|  : Invokes the plugin: calls infodev:filesystem-walk on the configured directory, | |
|  : passing textscan:process-file as the callback function and the model as its context. | |
|  : | |
|  : @param $model          plugin model; the directory is read from plugin:data/*:dir | |
|  : @param $ticket-id      ticket of the current run | |
|  : @param $policy-deltas  optional policy overrides, passed through unchanged | |
|  :) | |
| declare function textscan:start( | |
|   $model as element(), | |
|   $ticket-id as xs:string, | |
|   $policy-deltas as element(info:options)? | |
| ) | |
| as empty-sequence() | |
| { | |
|   infodev:filesystem-walk( | |
|     fn:string($model/plugin:data/*:dir), | |
|     $ticket-id, | |
|     xdmp:function(xs:QName("textscan:process-file")), | |
|     $policy-deltas, | |
|     $model | |
|   ) | |
| }; | |
| (:~ | |
|  : Loads a single file; textscan:start passes this function to infodev:filesystem-walk as the callback. | |
|  : | |
|  : The settings tokenize, wrap, big and max are read from the plugin:data element of $context. | |
|  :   - With a non-empty tokenize pattern the content is split with fn:tokenize. Every non-empty | |
|  :     token (at most max of them) becomes a document named <source-location>-<n>.xml or .txt, | |
|  :     and the documents are ingested in batches through infodev:transaction. | |
|  :   - Without a pattern the whole content is ingested as one document through infodev:ingest. | |
|  : When wrap is non-empty the text is wrapped in an element of that name and the name ends in | |
|  : .xml; otherwise the content is passed on unwrapped and the name ends in .txt. | |
|  : | |
|  : @param $document         not used as given: it is rebound to the content read from $source-location | |
|  : @param $source-location  location of the file to load | |
|  : @param $ticket-id        ticket of the current run | |
|  : @param $policy-deltas    optional policy overrides, passed through to the infodev calls | |
|  : @param $context          the plugin model that textscan:start supplied | |
|  : @return the results of the ingest calls, or of the error handlers when something failed | |
|  :) | |
| declare function textscan:process-file( | |
|   $document as node()?, | |
|   $source-location as xs:string, | |
|   $ticket-id as xs:string, | |
|   $policy-deltas as element(info:options)?, | |
|   $context as item()? | |
| ) | |
| as xs:string* | |
| { | |
|   let $settings := $context/plugin:data | |
|   let $tokenize := string($settings/tokenize) | |
|   let $wrap := string($settings/wrap) | |
|   let $big := (string($settings/big) eq "true") | |
|   let $max := $settings/max[. != '']/xs:integer(.) | |
|   (: wrapped content is stored as XML, unwrapped content as plain text :) | |
|   let $extension := if ($wrap != "") then ".xml" else ".txt" | |
|   let $document := | |
|     if ($big) then | |
|       infodev:get-text($source-location,$ticket-id,$policy-deltas) | |
|     else | |
|       infodev:get-file($source-location,$ticket-id,$policy-deltas) | |
|   let $result := | |
|     try { | |
|       if ($tokenize != "") then | |
|         (: fn:tokenize returns a zero-length token when the input starts or ends with a | |
|            separator (e.g. a trailing newline); drop those so no empty documents are created :) | |
|         let $tokens := fn:tokenize($document, $tokenize)[. ne ""] | |
|         let $lines := if (fn:exists($max)) then $tokens[1 to $max] else $tokens | |
|         let $nr-lines := count($lines) | |
|         (: get transaction-size from policy :) | |
|         let $ticket := info:ticket($ticket-id) | |
|         let $name := fn:data($ticket/info:policy-name) | |
|         let $policy-size := infodev:effective-policy($name,())/info:max-docs-per-transaction/xs:integer(.) | |
|         (: without a usable size (missing or below 1) load everything in one batch; | |
|            this also rules out a division by zero below :) | |
|         let $transaction-size := ($policy-size[. ge 1], fn:max((1, $nr-lines)))[1] | |
|         let $nr-transactions := xs:integer(ceiling($nr-lines div $transaction-size)) | |
|         (: set total documents and total transactions so UI displays collecting :) | |
|         let $current-total := | |
|           ($ticket/info:total-documents/xs:integer(.), 0)[1] | |
|         let $current-trans := | |
|           ($ticket/info:total-transactions/xs:integer(.), 0)[1] | |
|         (: Increment upon current counts, but subtract 1 on doc total for the file being split; it doesn't get loaded itself. :) | |
|         let $set-total := infodev:ticket-set-total-documents($ticket-id, $current-total + $nr-lines - 1) | |
|         let $set-trans := infodev:ticket-set-total-transactions($ticket-id, $current-trans + $nr-transactions) | |
|         (: create transactions by breaking document set into maps | |
|            each map of documents is saved to the db in their own transaction :) | |
|         for $transaction-nr in (1 to $nr-transactions) | |
|         let $transaction := map:map() | |
|         let $start := (($transaction-nr - 1) * $transaction-size) + 1 | |
|         let $finish := min((($start - 1 + $transaction-size), $nr-lines)) | |
|         let $put := | |
|           for $line at $line-nr in $lines[$start to $finish] | |
|           let $id := fn:concat($source-location, "-", ($start + $line-nr - 1), $extension) | |
|           return map:put($transaction,$id,if ($wrap != "") then element {$wrap} {$line} else text { $line }) | |
|         (: the callback function for ingest :) | |
|         let $function := xdmp:function(xs:QName("infodev:ingest")) | |
|         return | |
|           try { | |
|             infodev:transaction($transaction,$ticket-id,$function,$policy-deltas,$transaction-nr,(),()) | |
|           } catch($e) { | |
|             infodev:handle-error($ticket-id, concat("transaction ",$transaction-nr), $e) | |
|           } | |
|       else | |
|         (: replace the original file extension with the one matching the stored content :) | |
|         let $text-name := fn:concat(fn:replace($source-location, "\.[^.]+$", ""), $extension) | |
|         return | |
|           infodev:ingest(if ($wrap != "") then element {$wrap} {$document} else $document,$text-name,$ticket-id,$policy-deltas) | |
|     } catch($e) { | |
|       (: serialize the error element so the log shows its structure instead of run-together text :) | |
|       (infodev:handle-error($ticket-id, $source-location, $e), xdmp:log(fn:concat("ERROR ", xdmp:quote($e)))) | |
|     } | |
|   return $result | |
| }; | |
| (:~ | |
|  : Stand-alone page used to configure the collector. | |
|  : | |
|  : Renders one form control per child of plugin:data (via textscan:insert-field) | |
|  : inside a form that posts to $submit-here. When no model is supplied the | |
|  : defaults from textscan:model() are shown. | |
|  : | |
|  : The XML comment inside the form is older hand-written markup. It is a direct | |
|  : comment constructor, so its curly braces are not evaluated and it is emitted | |
|  : as a comment node exactly as written. | |
|  : | |
|  : @param $model        current plugin model, or empty to use the defaults | |
|  : @param $lang         language passed to textscan:string for the labels | |
|  : @param $submit-here  value of the form's action attribute | |
|  : @return the plugin:config-view element wrapping the XHTML page | |
|  :) | |
| declare function textscan:config-view( | |
|   $model as element(plugin:plugin-model)?, | |
|   $lang as xs:string, | |
|   $submit-here as xs:string | |
| ) | |
| as element(plugin:config-view) | |
| { | |
|   (: first item wins: the supplied model when present, otherwise the defaults :) | |
|   let $model := ($model, textscan:model())[1] | |
|   return | |
|     <config-view xmlns="http://marklogic.com/extension/plugin"> | |
|       <html xmlns="http://www.w3.org/1999/xhtml"> | |
|         <head> | |
|           <title>iframe plugin configuration</title> | |
|         </head> | |
|         <body> | |
|           <h2>{textscan:string("config-title", $model, $lang)}</h2> | |
|           <form style="margin-top: 20px;" action="{$submit-here}" method="post"> | |
|             { for $field in $model/plugin:data/* return textscan:insert-field($field, $model, $lang) } | |
| <!--label for="dir">{ textscan:string("dir-label", $model, $lang) }</label> | |
| <input type="text" name="dir" id="dir" style="width: 400px" value="{$dir}"/> | |
| <p style="color: rgb(125,125,125); font-style: italic;"> | |
| The full path on the remote host. This directory and its contents must be readable by MarkLogic. | |
| </p><br/> | |
| <label for="tokenize">{ textscan:string("tokenize-label", $model, $lang) }</label> | |
| <input type="text" name="tokenize" id="wrap" style="width: 400px" value="{$tokenize}"/> | |
| <p style="color: rgb(125,125,125); font-style: italic;"> | |
| Enter a tokenize pattern if the input files need to be split. | |
| </p><br/> | |
| <label for="wrap">{ textscan:string("wrap-label", $model, $lang) }</label> | |
| <input type="text" name="wrap" id="wrap" style="width: 400px" value="{$model/plugin:data/*:dir}"/> | |
| <select name="asxml" id="asxml"> | |
| { for $d in $truefalse-options/* return <option value="{$d/@value}">{if ($d/@value eq $as-xml) then $sel else () }{$d/string()}</option> } | |
| </select> | |
| <p style="color: rgb(125,125,125); font-style: italic;"> | |
| Choose True to wrap text in a 'text' element. | |
| </p><br/> | |
| <label for="big">{ textscan:string("big-label", $model, $lang) }</label> | |
| <select name="big" id="big"> | |
| { for $d in $truefalse-options/* return <option value="{$d/@value}">{if ($d/@value eq $big) then $sel else () }{$d/string()}</option> } | |
| </select> | |
| <p style="color: rgb(125,125,125); font-style: italic;"> | |
| Choose True to read text using the raw text reader function (options like encoding will not work). | |
| </p><br/--> | |
|             <div style="position: absolute; bottom: 2px; right: 0px;"> | |
|               <ml:submit label="Done"/> | |
|             </div> | |
|           </form> | |
|         </body> | |
|       </html> | |
|     </config-view> | |
| }; | |
| (:~ | |
|  : Renders the form controls for one field of the plugin model. | |
|  : | |
|  : Produces, in order: a label, the control itself, a description paragraph | |
|  : and a line break. A field marked as="xs:boolean" becomes a true/false | |
|  : drop-down with the current value selected; any other field becomes a text | |
|  : input showing the field value, or the field's hint when the value is empty. | |
|  : Label and description texts are looked up with textscan:string under the | |
|  : keys <field-name>-label and <field-name>-description. | |
|  : | |
|  : @param $field  the plugin:data child to render; its local name is used as control name and id | |
|  : @param $model  plugin model, passed on to textscan:string | |
|  : @param $lang   language passed on to textscan:string | |
|  : @return the sequence of elements described above | |
|  :) | |
| declare function textscan:insert-field( | |
|   $field as element(), | |
|   $model as element(plugin:plugin-model), | |
|   $lang as xs:string | |
| ) | |
| as item()* | |
| { | |
|   let $field-name := local-name($field) | |
|   let $field-value := string($field) | |
|   let $field-hint := string($field/@hint) | |
|   return ( | |
|     (: the label must point at the id of this field's own control, not always at "dir" :) | |
|     <label for="{$field-name}">{ | |
|       textscan:string(fn:concat($field-name, "-label"), $model, $lang) | |
|     }: </label>, | |
|     if ($field/@as = 'xs:boolean') then | |
|       <select name="{$field-name}" id="{$field-name}">{ | |
|         for $d in ('true', 'false') | |
|         return | |
|           <option value="{$d}">{ | |
|             if ($d eq $field-value) then | |
|               attribute selected { "selected" } | |
|             else (), | |
|             $d | |
|           }</option> | |
|       }</select> | |
|     else | |
|       <input type="text" name="{$field-name}" id="{$field-name}" style="width: 400px" value="{($field-value[. != ''], $field-hint)[1]}"/> | |
|     , | |
|     <p style="color: rgb(125,125,125); font-style: italic;">{ | |
|       textscan:string(fn:concat($field-name, "-description"), $model, $lang) | |
|     }</p>, | |
|     <br/> | |
|   ) | |
| }; | |
| (:~ | |
|  : Implements the collector cancel capability by setting the status of the given ticket to "cancelled". | |
|  : | |
|  : @param $ticket-id  ticket whose status is updated | |
|  :) | |
| declare function textscan:cancel( | |
|   $ticket-id as xs:string | |
| ) | |
| as empty-sequence() | |
| { | |
|   let $new-status := "cancelled" | |
|   return infodev:ticket-set-status($ticket-id, $new-status) | |
| }; | |
| (:~ | |
|  : Validates a model: returns () when it is good, otherwise reports carrying the id of the offending field. | |
|  : | |
|  : The only check is on dir: it is rejected when it is empty or still equal to | |
|  : its own hint attribute. The error text is always looked up in English ('en'). | |
|  : | |
|  : @param $model  plugin model to check | |
|  : @return a plugin:report with id "dir" when the directory is missing, otherwise nothing | |
|  :) | |
| declare function textscan:validate( | |
|   $model as element(plugin:plugin-model) | |
| ) | |
| as element(plugin:report)* | |
| { | |
|   let $dir := $model/plugin:data/dir | |
|   let $is-blank := string-length($dir) eq 0 | |
|   let $is-hint := $dir eq $dir/@hint | |
|   return | |
|     if ($is-blank or $is-hint) then | |
|       <plugin:report id="dir">{textscan:string('empty-dir-error', $model, 'en')}</plugin:report> | |
|     else | |
|       () | |
| }; | |
| (:~ | |
|  : All labels needed for display are collected here. | |
|  : | |
|  : Looks up the label with the given key in the requested language. When the | |
|  : label has no value for that language (for example a regional code such as | |
|  : "en-US"), the English value is returned instead of nothing. | |
|  : | |
|  : The "description" label depends on $model: with a model it lists every | |
|  : plugin:data field as "<label>: <value>", without one it is a fixed sentence. | |
|  : | |
|  : @param $key    label key, e.g. "name", "dir-label", "empty-dir-error" | |
|  : @param $model  optional plugin model, only used for the "description" label | |
|  : @param $lang   requested language, compared with the xml:lang of the label values | |
|  : @return the label text, or the empty sequence when the key is unknown | |
|  :) | |
| declare function textscan:string( | |
|   $key as xs:string, | |
|   $model as element(plugin:plugin-model)?, | |
|   $lang as xs:string | |
| ) | |
| as xs:string? | |
| { | |
|   let $labels := | |
|     <lbl:labels xmlns:lbl="http://marklogic.com/xqutils/labels"> | |
|       <!-- labels used by Info Studio framework! --> | |
|       <lbl:label key="name"> | |
|         <lbl:value xml:lang="en">Filesystem Text Directory</lbl:value> | |
|       </lbl:label> | |
|       <lbl:label key="description"> | |
|         <lbl:value xml:lang="en">{ | |
|           if ($model) then | |
|             string-join(( | |
|               "Load the contents of text files from this directory on the server: ", "<br/>", | |
|               for $field in $model/plugin:data/* | |
|               let $field-name := fn:local-name($field) | |
|               let $field-value := fn:string($field) | |
|               (: no model is passed here, so the nested lookup never rebuilds this field list :) | |
|               let $label := textscan:string(fn:concat($field-name, '-label'), (), $lang) | |
|               return | |
|                 fn:concat($label, ': ', $field-value, "<br/>") | |
|             ), '') | |
|           else | |
|             "Load the contents of text files from a directory on the server" | |
|         }</lbl:value> | |
|       </lbl:label> | |
|       <lbl:label key="start-label"> | |
|         <lbl:value xml:lang="en">Run</lbl:value> | |
|       </lbl:label> | |
|       <!-- labels for validate-errors --> | |
|       <lbl:label key="empty-dir-error"> | |
|         <lbl:value xml:lang="en">Specified directory must not be empty</lbl:value> | |
|       </lbl:label> | |
|       <!-- labels for config-view --> | |
|       <lbl:label key="config-title"> | |
|         <lbl:value xml:lang="en">Text Collector Configuration</lbl:value> | |
|       </lbl:label> | |
|       <lbl:label key="dir-label"> | |
|         <lbl:value xml:lang="en">Directory path</lbl:value> | |
|       </lbl:label> | |
|       <lbl:label key="dir-description"> | |
|         <lbl:value xml:lang="en">The full path on the remote host. This directory and its contents must be readable by MarkLogic.</lbl:value> | |
|       </lbl:label> | |
|       <lbl:label key="tokenize-label"> | |
|         <lbl:value xml:lang="en">Tokenize pattern</lbl:value> | |
|       </lbl:label> | |
|       <lbl:label key="tokenize-description"> | |
|         <lbl:value xml:lang="en">Enter a tokenize pattern if the input files need to be split.</lbl:value> | |
|       </lbl:label> | |
|       <lbl:label key="wrap-label"> | |
|         <lbl:value xml:lang="en">Wrap element</lbl:value> | |
|       </lbl:label> | |
|       <lbl:label key="wrap-description"> | |
|         <lbl:value xml:lang="en">Enter an element name to wrap text in that element. This is necessary for transformation with XSLT.</lbl:value> | |
|       </lbl:label> | |
|       <lbl:label key="big-label"> | |
|         <lbl:value xml:lang="en">Read text RAW</lbl:value> | |
|       </lbl:label> | |
|       <lbl:label key="big-description"> | |
|         <lbl:value xml:lang="en">Choose True to read text using the raw text reader function. Options like encoding will not work.</lbl:value> | |
|       </lbl:label> | |
|       <lbl:label key="max-label"> | |
|         <lbl:value xml:lang="en">Max tokens</lbl:value> | |
|       </lbl:label> | |
|       <lbl:label key="max-description"> | |
|         <lbl:value xml:lang="en">Enter the max number of lines or tokens that need to be processed.</lbl:value> | |
|       </lbl:label> | |
|     </lbl:labels> | |
|   let $values := $labels/lbl:label[@key eq $key]/lbl:value | |
|   return | |
|     (: prefer the requested language, otherwise fall back to English; [1] keeps the result within xs:string? :) | |
|     ($values[@xml:lang eq $lang], $values[@xml:lang eq "en"])[1]/string() | |
| }; | |
| (:~ ---------------- Main: registers the capability map of this collector ---------------- :) | |
| let $capabilities := textscan:capabilities() | |
| return plugin:register($capabilities, "collector-text.xqy") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment