Last active
August 29, 2015 14:23
-
-
Save brodieG/dad62cb00f8337635874 to your computer and use it in GitHub Desktop.
Code for SO Q: Random sample of character vector, without elements prefixing one another
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
sample0110 <- function(size, n, complete.only=FALSE) { | |
size <- as.integer(size) | |
n <- as.integer(n) | |
if(size > 25 || size < 3L) stop( | |
"Currently size min is 3 and max is 25, though should be possible to allow ", | |
"smaller and larger with some changes" | |
) | |
# Generate integer pool and weights | |
size0 <- size - 1L | |
pool.raw <- seq.int(2L ^ size) - 1L | |
pool.raw.len <- valid.unique <- length(pool.raw) | |
# weights are a function of how many trailing zeroes each number has, for | |
# example `1000` has three trailing zeroes and represnts `1000`, `100`, | |
# `10`, and `1`, so it should be weighed 4x | |
weights <- rep(1L, pool.raw.len) | |
for(i in seq.int(size0)) | |
weights[seq.int(from=1L, to=pool.raw.len, by=2 ^ i)] <- i + 1L | |
# Create indices to map from the "weighted" vectors to the original | |
# vectors | |
pool.vals <- rep(pool.raw, weights) | |
pool.len <- length(pool.vals) | |
# For each repeated value, what count of trailing zeros does it correspond | |
# to (equivalent to: `unlist(lapply(weights, seq.int))`, but faster) | |
z <- integer(pool.len) | |
z[c(1L, cumsum(head(weights, -1L)) + 1L)] <- 1L | |
w <- cumsum(!z) | |
t <- cummax(z * w) | |
zeros.imp <- w - t + 1L | |
pad.imp <- weights[pool.vals + 1L] - zeros.imp | |
# Generate our encoded vectors by right padding with enough zeros and then | |
# adding as a value the number of zeros to the padded area | |
zero.pad <- as.integer(2L ^ ceiling(log2(size))) | |
vals.enc <- vals.enc.init <- pool.vals * zero.pad + zeros.imp - 1L | |
# Results tracking | |
res <- matrix(0L, nrow=n, ncol=2L) | |
res[, 2L] <- size # leads to "" if not changed | |
max.allowed <- size0 # how padded a number to pick can be | |
free <- rep(TRUE, pool.len) | |
# Pre compute frequently used sequences and number patterns | |
zero.mx <- as.integer(2 ^ (size - seq(size))) * | |
!lower.tri(matrix(ncol=size, nrow=size)) | |
seqs <- lapply(1L:size, seq.int) | |
seqs0 <- lapply(seqs, `-`, 1L) | |
seq.rev <- rev(seq.int(size)) | |
seq.rev0 <- seq.rev - 1L | |
ones <- rep(1L, size) | |
weights.cs <- cumsum(weights) | |
pool.lu <- c(1L, head(weights.cs, -1L) + 1L) | |
# Loop through the `n` requested samples | |
for(i in seq.int(n)) { | |
# Check for completeness, and remove values that would lead to incomplete | |
# pools. We only remove padded values so `valid.unique` is unchanged | |
if(complete.only) { | |
if(max.allowed) { | |
rem.pow <- which(n - i >= valid.unique - 2L ^ seqs[[max.allowed]]) | |
for(j in rev(rem.pow)) { | |
to.rem <- which(pad.imp == max.allowed) | |
free[to.rem] <- FALSE | |
max.allowed <- max.allowed - 1L | |
} | |
if(!max.allowed && n - i >= valid.unique) | |
stop( | |
"Logic Error: pool is not large enough to support complete samples" | |
) } } | |
vals.enc <- vals.enc.init[which(free)] | |
if(!(pool.len.left <- length(vals.enc))) break | |
val.enc <- if(pool.len.left > 1L) sample(vals.enc, 1L) else vals.enc | |
# Figure out how many trailing zeros our number has (recall, this is | |
# encoded in the least significant bits of our number); note, zeros is a bit | |
# misleading, it means: "how many digits after initial digit are explicilty | |
# specied". The name `zeros` comes from numbers like `1` that would need to | |
# add zeros to be specified (e.g. `1000`, which has three zeros) | |
val <- val.enc %/% zero.pad | |
enc <- val.enc %% zero.pad | |
weight <- weights[[val + 1L]] | |
zeros <- size - weight + enc | |
pad <- size0 - zeros | |
res[i, ] <- c(val, pad) | |
# Based on number of zeros, we can figure out up to what value we need | |
# to disqualify (NOTE: different than withbin, here we get the next value | |
# greater than our range because `free` is always same size) | |
disq.hi.enc <- as.integer((val + 2L ^ pad)) * zero.pad | |
# Incremental disqualification of smaller patterns by computing the | |
# decimal value from a sequantially truncated bit matrix | |
disq.loc.extra <- if(zeros) { | |
seq.z <- seqs[[zeros]] | |
disqual.more.tmp <- as.integer( | |
ones[seq.z] %*% ( | |
as.integer(intToBits(val)[seq.rev])[seq.z] * | |
zero.mx[seq.z, seq.z, drop=F] | |
) ) | |
ws <- weights[disqual.more.tmp + 1L] | |
offset <- seqs0[[zeros]] + ws - size | |
disq.loc <- pool.lu[disqual.more.tmp + 1L] + offset | |
disqualifiable <- which(disqual.more.tmp < val) | |
valid.unique <- valid.unique - sum(!(ws - offset - 1L)[disqualifiable]) | |
unique.default(disq.loc[disqualifiable]) | |
} else integer() | |
# Find values to remove, first with the range of values disqualified by our | |
# pick | |
lo <- val.enc %/% zero.pad + 1L | |
hi <- disq.hi.enc %/% zero.pad + 1L | |
free[ | |
seq.int( | |
from=pool.lu[[lo]], | |
to=max(pool.lu[[lo]], pool.lu[[hi - 1L]] + weights[[hi - 1L]] - 1L) | |
) ] <- FALSE | |
# Now remove any parent values | |
free[disq.loc.extra] <- FALSE | |
valid.unique <- valid.unique - 2 ^ pad # Not certain this works cleanly in this version | |
} | |
# Now convert to binary representation; note we assume ints are 32 bits | |
res.raw <- matrix(as.integer(intToBits(res[, 1L])), nrow=32L)[seq.rev, ] | |
substr(do.call(paste0, split(res.raw, row(res.raw))), 0L, size - res[, 2L]) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment