Created
August 30, 2018 14:56
-
-
Save damonfeldman/70dcef2083120e6d1761ce97411cb928 to your computer and use it in GitHub Desktop.
MarkLogic random sampling functions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'use strict';
/** Random sampling of arrays or sequences.
Limitations: not 100% random. Tries not to always start at 0 or pick items at the exact end of the list, but some biases remain.
Intended to get random blocks of contiguous items. Blocks are used for two reasons:
1. it is sometimes nice to only get ranges, then fetch the data later, to avoid a huge URI or ID pull up front.
2. each chunk is one pass through the loop, including random generation and adjustments; fewer/larger chunks run faster than chunkSize=1.
*/
// If the random starts extended beyond (or bunched up well short of) the total
// size of the list, shift them proportionally until they fit.
function stretchSqueeze(rangeStarts, tot, chunkSize) {
  const count = rangeStarts.length;
  const avgIntervalSize = tot / count;
  const lastEnd = rangeStarts[count - 1] + chunkSize;
  if (lastEnd < tot - avgIntervalSize) {
    // Sample is skewed heavily toward the beginning of the list. Because there
    // is no way to skew toward the end, disallow this: stretch every start
    // forward so the final chunk ends somewhere inside the last expected
    // interval (randomized so it does not always end on the last item).
    const moveAheadAmt = (tot - lastEnd) - random(avgIntervalSize);
    const amtPerRange = moveAheadAmt / count;
    return rangeStarts.map((start, i) => start + (i + 1) * amtPerRange);
  }
  if (lastEnd <= tot) {
    return rangeStarts; // already fits — nothing to adjust
  }
  // Random intervals ran past the end of the list: move all starts back enough
  // to fit into range, ending at a random point inside the last expected
  // interval rather than always on the last item, without creating overlap.
  const moveBackAmt = (lastEnd - tot) + random(avgIntervalSize);
  const amtPerRange = moveBackAmt / count;
  return rangeStarts.map((start, i) => start - (i + 1) * amtPerRange);
}
/* Decimal random in (0, max] with five digits of precision after the decimal.
   (xdmp.random is a MarkLogic built-in returning a random non-negative integer
   up to its argument — NOTE(review): confirm inclusive upper bound.) */
function random(max) {
  const scaled = xdmp.random(max * 100000 - 1) + 1; // integer in [1 .. max*100000]
  return scaled / 100000;
}
/**
 * Calculate `samples` random range starting points in the interval [0..tot-1],
 * anticipating ranges of size `chunkSize`.
 * Returns decimals, not ints: rounding at every step throws off the randomness,
 * so callers truncate to ints later.
 * Anticipating a chunk size lets us compute e.g. 1M random-ish values with only
 * 10K loop iterations, each iteration covering a chunk of 100.
 *
 * Fixes vs. original: removed the unused `lastStart` local and a stray
 * semicolon after the loop; behavior is unchanged.
 */
function computeRangeStarts(tot, samples, chunkSize) {
  const rangeStarts = [];
  const avgIntervalSize = tot / samples; // average distance between the start of one sample and the next
  let at = 0;
  for (let i = 0; i < samples; i++) {
    // skip from 0 to double the average so the mean skip hits the average overall
    let skip = random(avgIntervalSize * 2);
    if (skip <= chunkSize + 0.01) {
      skip = chunkSize + 0.01; // ensure we skip ahead a full chunk to avoid overlap
    }
    at = at + skip;
    rangeStarts.push(at);
  }
  return stretchSqueeze(rangeStarts, tot, chunkSize);
}
// Larger of two comparable values.
function max(a, b) {
  if (a > b) { return a; }
  return b;
}
// Smaller of two comparable values.
function min(a, b) {
  if (a < b) { return a; }
  return b;
}
/**
 * Produce `samples` random, non-overlapping {start, stop} integer ranges
 * (stop inclusive), each covering `chunkSize` items within [0..tot-1].
 */
function randomRanges(tot, samples, chunkSize) {
  const rangeStarts = computeRangeStarts(tot, samples, chunkSize);
  const ranges = [];
  let minNextInt = 0;
  for (const start of rangeStarts) {
    // very dense samples can floor two starts to the same int; always advance
    // at least one past the previous chunk's stop
    const startInt = max(minNextInt, math.floor(start));
    const stopInt = startInt + chunkSize - 1; // inclusive endpoint
    minNextInt = stopInt + 1;
    ranges.push({ "start": startInt, "stop": stopInt });
  }
  return ranges;
}
/**
 * Randomly sample URIs matching `query`, in `samples` contiguous chunks of
 * `chunkSize`, walking forward through the URI lexicon from each chunk's end.
 * Returns a flat array of URIs.
 * @throws {Error} when samples * chunkSize exceeds the matching document count.
 * TODO: generalize to any lexicon.
 */
function randomUris(query, samples, chunkSize) {
  const tot = cts.estimate(query);
  if (samples * chunkSize > tot) {
    throw new Error("sample size * num chunks is greater than total sequence size");
  }
  const rangeStarts = computeRangeStarts(tot, samples, chunkSize);
  let minNextInt = 0;
  let lastUri = "";
  let result = [];
  for (const start of rangeStarts) {
    const startInt = max(minNextInt, math.floor(start));
    const stopInt = startInt + chunkSize; // exclusive end; stopInt - startInt is the chunk length
    // BUG FIX: was `minNextInt++`, which only creeps forward by 1 per
    // iteration and does not actually move past the previous chunk; advance
    // to the end of the chunk just taken, matching randomRanges.
    minNextInt = stopInt;
    // concat a space to seek strictly past the last URI already taken
    const uris = cts.uris(lastUri + " ", Sequence.from(["limit=" + (stopInt - startInt)]), query).toArray();
    if (uris.length > 0) {
      // BUG FIX: guard against an empty chunk near the end of the lexicon,
      // which previously set lastUri to undefined
      lastUri = uris[uris.length - 1];
    }
    result = result.concat(uris);
  }
  return result;
}
/**
 * Randomly sample `samples` contiguous chunks of `chunkSize` items from
 * `dataArr`, returning a flat array of samples * chunkSize items.
 * @throws {Error} when samples * chunkSize exceeds dataArr.length.
 */
function randomSampleChunks(dataArr, samples, chunkSize) {
  const tot = dataArr.length;
  if (samples * chunkSize > tot) {
    throw new Error("sample size * num chunks is greater than total data size");
  }
  const rangeStarts = computeRangeStarts(tot, samples, chunkSize);
  let minNextInt = 0;
  let result = [];
  for (const start of rangeStarts) {
    const startInt = max(minNextInt, math.floor(start));
    const stopInt = startInt + chunkSize; // exclusive: slice() ignores the endpoint
    // BUG FIX: was `minNextInt++`, which lets a later chunk overlap an earlier
    // one when very dense samples floor to nearby ints; advance past the
    // chunk just taken so chunks never overlap or duplicate items.
    minNextInt = stopInt;
    result = result.concat(dataArr.slice(startInt, stopInt));
  }
  return result;
}
///////////// ================== inline smoke tests ==================
const errors = [];
// Collect a failure message instead of aborting on the first failure.
function test(cond, msg) {
  if (!cond) errors.push(msg);
}
let tot = 1000;
let data = [];
for (let i = 0; i < tot; i++) data.push(i);
// data = cts.uris().toArray()
let samples = 20;
let chunkSize = 4;
let res = randomSampleChunks(data, samples, chunkSize);
test(res.length === 20 * 4, "expect 80 items. got " + res.length);
// BUG FIX: message previously built "...diff=" + res[3] - res[0], which is NaN
// (string concatenation binds before subtraction); parenthesized the diff.
test(res[3] - res[0] === 3, "expect first chunk of 4 to be contiguous. diff=" + (res[3] - res[0]));
test(res[7] - res[4] === 3, "expect second chunk of 4 to be contiguous.");
test(res[11] - res[8] === 3, "expect third chunk of 4 to be contiguous");
test(res[15] - res[12] === 3, "expect fourth/last chunk of 4 to be contiguous");
test(res[15] <= 999, "last item must be at or below max of 999");
let ranges = randomRanges(tot, samples, chunkSize);
test(ranges.length === samples, "expected 20 ranges");
test(ranges[0].stop - ranges[0].start === 3, "expect first range to span chunkSize items. size=" + (ranges[0].stop - ranges[0].start));
test(ranges[19].stop <= 999, "last item must be at or below max of 999. was " + ranges[19].stop);
//--- test some very dense samples: 990 items of 1000 ---
samples = 990;
chunkSize = 1;
res = randomSampleChunks(data, samples, chunkSize);
test(res.length === 990, "expect 990 chunks. got " + res.length);
test(res[989] <= 999, "last item must be at or below max of 999");
test(res[989] > 995, "last item extremely unlikely to be other than 998 or 999. was " + res[989]); // about 4 or 5 gaps should fall randomly within the sequence, not two at the very end
ranges = randomRanges(tot, samples, chunkSize);
test(ranges.length === 990, "ranges: expect 990 chunks. got " + ranges.length);
// Known bug: samples that include pretty much all of the list can overrun slightly.
// Appears to be because we occasionally advance the next range to avoid
// overlap/duplication of the last stop point; if this happens too often it
// pushes the last item past the end.
test(ranges[989].stop <= 999, "known issue w dense ranges: last item must be at or below max of 999. was:" + ranges[989].stop);
// BUG FIX: message previously reported res[989] instead of ranges[989].stop
test(ranges[989].stop > 997, "ranges: last item extremely unlikely to be other than 998 or 999. was " + ranges[989].stop);
//--- uneven numbers ---
tot = 1234;
samples = 77;
chunkSize = 11;
data = [];
for (let i = 0; i < tot; i++) data.push(i);
res = randomSampleChunks(data, samples, chunkSize);
// BUG FIX: messages previously said 781 items, but 77 * 11 = 847
test(res.length === 77 * 11, "expect 847 items. got " + res.length);
test(res[10] - res[0] === 10, "uneven: expect first chunk of 11 to be contiguous");
test(res[21] - res[11] === 10, "uneven: expect second chunk of 11 to be contiguous");
// NOTE(review): res[780] is NOT the final item (that is res[846]); the original
// indices look based on the 77*11=781 miscalculation. Thresholds below were
// tuned against res[780], so the index is left as-is — confirm intent.
test(res[780] <= 1233, "uneven: item 780 must be at or below max of 1233");
test(res[780] < 1220, "uneven: item 780 unlikely to be too close to end. was:" + res[780]);
test(res[780] > 820, "uneven: item 780 unlikely to be very close to theoretical min of 781:" + res[780]);
res = randomUris(cts.trueQuery(), samples, chunkSize);
test(res.length === 77 * 11, "expect 847 items. got " + res.length);
const uniqueArray = res.filter((item, pos) => res.indexOf(item) === pos);
test(uniqueArray.length === res.length, "no duplicates");
const x = { "err": errors };
x
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment