Last active
May 29, 2019 15:54
-
-
Save masyukun/5d39245ca3202b5e5e2e5ee91b2a540e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "1.0-ml"; | |
declare namespace p = "http://www.mycompany.com"; | |
declare option xdmp:mapping "false"; | |
(:~
 : Returns the z-score for the specified confidence value.
 :
 : The common confidence values 0.5, 0.8, 0.9, 0.95, 0.98 and 0.99 are answered
 : from a lookup table. Any other value in the accepted range is approximated by
 : solving the cubic obtained from a polynomial regression of confidence
 : percentages to z-scores (R^2 = 1 fit):
 :
 :   0.0481*z^3 - 0.4007*z^2 + 1.1354*z - (0.098 + confidence) = 0
 :
 : local:solveCubicEquation() is given the coefficient magnitudes, i.e. it solves
 : the same polynomial with z replaced by -z, so its result is the negated
 : z-score; math:fabs() turns it back into the positive z-score.
 :
 : @author Matthew Royal
 : @param $confidence confidence level as a fraction, 0.5 <= $confidence <= 1.0
 : @return the z-score (a literal from the table, or the regression estimate)
 : @error raised through fn:error() when $confidence is outside 0.5 to 1.0
 : @see https://matthewroyal.com/blog/wp-content/uploads/2015/05/Screen-Shot-2015-05-24-at-10.57.54-AM.png
 :)
declare function local:getZScore($confidence as xs:double) {
  if ($confidence ge 0.5 and $confidence le 1) then
    if ($confidence eq 0.5) then 0.674
    else if ($confidence eq 0.8) then 1.282
    else if ($confidence eq 0.9) then 1.645
    else if ($confidence eq 0.95) then 1.96
    else if ($confidence eq 0.98) then 2.326
    else if ($confidence eq 0.99) then 2.576
    else
      (: Not in the table: fall back to the regression cubic. :)
      math:fabs(local:solveCubicEquation(0.0481, 0.4007, 1.1354, 0.098 + $confidence))
  else
    (: The first argument of fn:error() is an optional QName, so the message
       has to be passed as the second argument to reach the caller. :)
    fn:error((), "ERROR: 0.5 <= $confidence <= 1.0, and input was " || fn:string($confidence))
};
(:~
 : Solves a cubic polynomial for x using the closed-form (Cardano) formula,
 : given the coefficients of a*x^3 + b*x^2 + c*x + d = 0.
 :
 : Only the single-real-root case is handled: the formula takes the square root
 : of big^2 + small^3, so that quantity must not be negative, and $a must not
 : be zero.
 :
 : To solve a*z^3 - b*z^2 + c*z - d = 0 pass a, b, c, d unchanged and negate the
 : result, because substituting x = -z turns one form into the other.
 :
 : math:pow() cannot raise a negative number to a fractional power, so both
 : cube roots are taken on the absolute value and the sign is restored.
 :
 : @author Matthew Royal
 : @param $a 1st coefficient (x^3)
 : @param $b 2nd coefficient (x^2)
 : @param $c 3rd coefficient (x)
 : @param $d 4th coefficient (constant term)
 : @return the real root x
 : @see http://www.math.vanderbilt.edu/~schectex/courses/cubic/
 :)
declare function local:solveCubicEquation($a as xs:double, $b as xs:double, $c as xs:double, $d as xs:double) as xs:double {
  let $big :=
    ((-math:pow($b, 3.0)) div (27.0 * math:pow($a, 3.0)))
    + (($b * $c) div (6.0 * math:pow($a, 2.0)))
    - ($d div (2.0 * $a))
  let $small := ($c div (3.0 * $a)) - (math:pow($b, 2.0) div (9.0 * math:pow($a, 2.0)))
  let $tail := ($b div (3.0 * $a))
  (: The square root is shared by both cube-root terms; compute it once. :)
  let $root := math:sqrt(math:pow($big, 2.0) + math:pow($small, 3.0))
  (: Real cube root that also accepts negative radicands. :)
  let $cubeRoot := function ($value as xs:double) as xs:double {
    if ($value lt 0) then -math:pow(-$value, 1.0 div 3.0)
    else math:pow($value, 1.0 div 3.0)
  }
  return $cubeRoot($big + $root) + $cubeRoot($big - $root) - $tail
};
(:~
 : Returns the real cube root of a number.
 :
 : math:pow() cannot raise a negative number to a fractional power, so the root
 : is taken on the absolute value and the sign of the input is restored
 : afterwards. Complex roots are not generated.
 :
 : @author Matthew Royal
 : @param $number the value whose cube root is wanted; may be negative
 : @return the real cube root, carrying the same sign as $number
 :)
declare function local:sqrt3($number as xs:double) as xs:double {
  let $magnitude := math:pow(fn:abs($number), 1.0 div 3.0)
  return
    if ($number lt 0) then -$magnitude
    else $magnitude
};
(:~
 : Computes how many items must be sampled from a finite population to reach
 : the requested confidence level and confidence interval.
 :
 : The unbounded sample size z^2 * p * (1 - p) / c^2 is computed first, with
 : p fixed at 0.5, and is then corrected for the finite population size.
 :
 : @author Matthew Royal
 : @param $population size of the whole population
 : @param $confidence confidence level as a fraction, passed to local:getZScore()
 : @param $confidenceInterval margin of error as a fraction
 : @return the population-corrected sample size (not rounded)
 : @see http://www.surveysystem.com/sample-size-formula.htm
 :)
declare function local:getSampleSize($population as xs:integer, $confidence as xs:double, $confidenceInterval as xs:double) {
  let $zScore := local:getZScore($confidence)
  (: default share of respondents picking a choice when computing sample size :)
  let $pickedShare := 0.5
  let $unbounded :=
    (math:pow($zScore, 2) * $pickedShare * (1 - $pickedShare))
      div math:pow($confidenceInterval, 2)
  return
    (: finite population correction :)
    $unbounded div (1 + (($unbounded - 1) div $population))
};
(:~
 : Estimates how many documents matching a query satisfy a predicate function
 : by evaluating the function on a random sample only.
 :
 : The population is xdmp:estimate() of the query and the sample size comes
 : from local:getSampleSize(). When $function is supplied, a "score-random"
 : search limited to the rounded-up sample size is passed to it through
 : xdmp:apply(), and the TRUE results are counted and scaled up to the
 : population. Without $function only the estimate and sample size are reported.
 :
 : @param $query the query that defines the population
 : @param $confidence confidence level as a fraction
 : @param $confidenceInterval margin of error as a fraction
 : @param $function optional function applied to the sampled documents; its
 :        results are read as one boolean per document, in document order
 : @param $numDocuments optional cap on the number of matching URIs listed;
 :        when absent or not positive, the rounded-up sample size is used
 : @return a sequence of report strings, followed by matching URIs when
 :         $function is supplied
 :)
declare function local:computeSample($query as cts:query, $confidence as xs:double, $confidenceInterval as xs:double, $function as xdmp:function?, $numDocuments as xs:integer?) {
  let $queryEstimate := xdmp:estimate(cts:search(/, $query))
  let $sampleSize := local:getSampleSize($queryEstimate, $confidence, $confidenceInterval)
  (: These two lines open the report in both branches. :)
  let $summary := (
    fn:concat("Query Estimate: ", $queryEstimate),
    fn:concat(
      "Sample size: ", $sampleSize,
      " ", $confidence * 100,
      "% +/-", $confidenceInterval * 100, "% confidence"
    )
  )
  return
    if (fn:empty($function)) then
      $summary
    else
      let $wholeSample := math:ceil($sampleSize)
      let $documents := cts:search(/, $query, ("score-random"))[1 to xs:integer($wholeSample)]
      let $bools := xdmp:apply($function, $documents)
      let $results := fn:count($bools[fn:boolean(.)])
      let $hitRatio := $results div $wholeSample
      let $uriLimit :=
        if (fn:exists($numDocuments) and $numDocuments gt 0) then $numDocuments
        else xs:int($wholeSample)
      (: Pair every sampled document with the result at the same position. :)
      let $matchingUris :=
        for $document at $index in $documents
        where $bools[$index] eq fn:true()
        return fn:base-uri($document)
      return (
        $summary,
        fn:concat(
          "XQuery function returned TRUE in ", ($hitRatio * 100.0),
          "% of the cases (", $results, " of ", xs:integer($wholeSample), "), ",
          "scaling to an estimated ~", math:ceil($hitRatio * $queryEstimate),
          " documents for the entire population ", $queryEstimate, "."
        ),
        fn:concat("Matching URIs (Up to ", $uriLimit, "):"),
        $matchingUris[1 to $uriLimit]
      )
};
(: Population: company directory documents for current, non-deleted records
   with hire year 2015 that contain an employeeName element. :)
let $hiredIn2015 :=
  cts:element-query(
    xs:QName("p:companyDirectory"),
    cts:and-query((
      cts:element-query(xs:QName("p:employeeName"), cts:and-query(())),
      cts:element-value-query(xs:QName("p:deleted"), "false"),
      cts:element-value-query(xs:QName("p:isCurrentVersion"), "true"),
      cts:element-value-query(xs:QName("p:hireYear"), "2015")
    ))
  )
(: Predicate: one boolean per document, true when any employee name is at
   least 60 characters long. :)
let $hasLongName := function ($documents as item()*) as xs:boolean* {
  for $document in $documents
  return
    some $name in $document//p:employeeName/fn:string()
    satisfies fn:string-length($name) ge 60
}
(: 95% confidence, +/-5% interval, list at most 10 matching URIs. :)
return local:computeSample($hiredIn2015, 0.95, 0.05, $hasLongName, 10)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Updated default population from "5" to the rounded up sample size.