Last active
August 29, 2015 14:02
-
-
Save xquery/3b527eaf04a02dc7f693 to your computer and use it in GitHub Desktop.
Simple Markov chain example using XQuery (requires MarkLogic)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "1.0-ml"; | |
import module namespace functx = "http://www.functx.com" at | |
"/MarkLogic/functx/functx-1.0-nodoc-2007-01.xqy"; | |
(: Builds the word corpus for a web page.
   Fetches $uri over HTTP, runs the second item of the response through
   xdmp:tidy, joins the text of every p element of the second item tidy
   returns, and splits that text on runs of non-word characters.
   $uri    : address of the page to fetch.
   returns : the lower-cased words in document order, never containing
             an empty string. :)
declare function local:generate-corpus(
  $uri
){
  let $response-body := xdmp:http-get($uri)[2]
  let $xhtml := xdmp:tidy($response-body)[2]
  let $text := fn:string-join($xhtml//*:p, ' ')
  for $token in tokenize($text, '\W+')
  (: tokenize emits an empty string when the text starts or ends with a
     separator, e.g. a closing full stop; those are not words :)
  where $token ne ''
  return lower-case($token)
};
(: Generates up to $num-words words by walking the Markov chain.
   $first-word : the word element whose followers are sampled for the
                 next word; it is the current state, not part of the output.
   $wordbase   : all word elements, as built by local:generate-wordbase.
   $num-words  : maximum number of words to produce.
   returns     : the chosen words in order, as returned by
                 local:choose-word. The walk ends early when the current
                 word is missing or has no recorded follower. :)
declare function local:generate-text(
  $first-word,
  $wordbase,
  $num-words
){
  (: "le 0" rather than "eq 0" so that a negative count cannot recurse
     forever; the empty check makes the dead-end case explicit instead of
     depending on function mapping over an empty argument :)
  if ($num-words le 0 or empty($first-word)) then ()
  else
    let $new-word := local:choose-word($first-word)
    return
      if (empty($new-word)) then ()
      else (
        $new-word,
        local:generate-text($wordbase[@value eq $new-word], $wordbase, $num-words - 1)
      )
};
(: Creates the Markov chain word database.
   $corpus : sequence of words in reading order.
   returns : one word element per distinct lower-cased word. Its value
             attribute holds the word and its content is a map:map whose
             keys are the words seen immediately after it and whose values
             are how many times each of them followed it. :)
declare function local:generate-wordbase(
  $corpus
){
  (: normalise once so the distinct list and the position lookups agree
     even when the caller passes mixed-case words :)
  let $tokens := for $w in $corpus return lower-case($w)
  for $word in distinct-values($tokens)
  let $positions := index-of($tokens, $word)
  (: the last corpus position has nothing after it and contributes no
     follower, so no empty key ever reaches map:put :)
  let $followers := for $p in $positions return $tokens[$p + 1]
  let $map := map:map()
  return
    <word value="{$word}">
    {
      (: one put per distinct follower; map:put returns nothing, so only
         the filled map ends up as element content :)
      (
        for $follower in distinct-values($followers)
        return map:put($map, $follower, count(index-of($followers, $follower)))
      ),
      $map
    }</word>
};
(: Probabilistic selection of the next word, based on the Markov chain.
   $word   : a word element whose descendant entry elements carry the
             follower in their key attribute and its count in a value child.
   returns : the key attribute of the selected entry, picked with a
             probability proportional to its count, or the empty sequence
             when the word has no recorded followers. :)
declare function local:choose-word(
  $word as element(word)
){
  let $weights := $word//*:value/number(.)
  let $total := sum($weights)
  return
    (: nothing to draw from, or counts that are not usable numbers :)
    if (empty($weights) or not($total ge 1)) then ()
    else
      (: xdmp:random includes both 0 and its maximum, so drawing from
         0..total would hand the first entry one extra slot; draw from
         1..total instead so every entry gets exactly "count" slots :)
      let $pick := xdmp:random(xs:unsignedLong($total - 1)) + 1
      let $hits :=
        for $weight at $n in $weights
        (: running total of the first $n counts :)
        where $pick le sum($weights[1 to $n])
        return $word//*:entry[$n]/@key
      return $hits[1]
};
(: Generates the corpus from a URI, builds the Markov chain for every word
   in it, then emits a randomly chosen starting word followed by up to ten
   generated words, separated by single spaces. :)
let $corpus := local:generate-corpus("http://en.wikipedia.org/wiki/Dixie_Square_Mall")
let $wordbase := local:generate-wordbase($corpus)
return
  (: no words were extracted, so there is nothing to start from :)
  if (empty($wordbase)) then ""
  else
    (: xdmp:random includes 0 and its maximum; position 0 selects nothing,
       so shift the draw to the valid positions 1..count :)
    let $r := xdmp:random(count($wordbase) - 1) + 1
    let $first-word := $wordbase[$r]
    return
      string-join(
        (
          $first-word/@value,
          local:generate-text($first-word, $wordbase, 10)
        ),
        " ")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example output: "major incidents occurred at the remainder of the blues brothers film"