Last active
August 31, 2023 10:28
-
-
Save tslumley/16bdc33a5ccfc05d0b8e5d690cb397fd to your computer and use it in GitHub Desktop.
Modify sample.int with new options for PPS with replacement sampling that have the specified inclusion probabilities
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## New sample.int | |
sample_int<-function(n, size=NULL, replace=FALSE, prob=NULL, | |
useHash=(n > 1e+07 && !replace && is.null(prob) && (!is.null(size)) && size <= n/2), | |
method=c("sequential","marginal","poisson")){ | |
if (replace || is.null(prob)){ | |
if (is.null(size)){ | |
size<-n | |
} | |
} else{ | |
if (is.null(size)) | |
size<-sum(prob) | |
} | |
stopifnot(length(n) == 1L) | |
if (useHash) { | |
stopifnot(is.null(prob), !replace) | |
.Internal(sample2(n, size)) | |
} | |
else if (!is.null(prob) && !replace){ | |
if (length(prob)!=n) stop("incorrect number of probabilities") | |
method<-match.arg(method) | |
switch(method, | |
sequential=sample.int(n, size=size, replace=replace,prob=prob, useHash=FALSE), | |
marginal=sample_pps(n, size, prob), | |
poisson=sample(seq.int(1,n)[runif(n)<=prob*size/sum(prob)])) | |
} else { | |
## Will be the .Internal from sample.int | |
sample.int(n, size=size ,replace=replace, prob=prob, useHash=FALSE) | |
} | |
} | |
sample_pps<-function(n, size, prob){ | |
eps<-1e-5 | |
s<-sum(prob) | |
sums_to_one <- abs((s-1)/(s+eps))<eps | |
sums_to_int <- abs((s-round(s))/(s+eps))<eps | |
if (is.null(size)){ | |
if(!sums_to_int) stop("sum(prob) must be an integer") | |
size<-round(s) | |
} else{ | |
size_is_sum = abs((size-sum(prob))/(size+eps))<eps | |
size_is_int = abs((size-round(size))/(size+eps))<eps | |
if (!size_is_int) stop("size must be NULL or an integer") | |
if (sums_to_one && !size_is_sum){ | |
warning("rescaling prob, which changes inclusion probabilities") | |
prob<-inclusion_probabilities(prob*size, size) | |
} else if (sums_to_int && !size_is_sum){ | |
warning("sum(prob) is not equal to size or 1, rescaling") | |
prob<-inclusion_probabilities(prob*size/s,size) | |
} | |
rval<-seq_len(n)[sampling::UPtille(prob)!=0] ##sample_brewer(prob) | |
if (size>1) | |
sample(rval) | |
else | |
rval | |
} | |
} | |
## taken from sampling::inclusionprobabilities by Alina Matei and Yves Tille (GPL>2) | |
inclusion_probabilities<-function (a, n) | |
{ | |
if (!is.vector(a)) | |
a = as.vector(a) | |
nnull = length(a[a == 0]) | |
nneg = length(a[a < 0]) | |
if (nnull > 0) | |
warning("there are zero values in the initial vector a\n") | |
if (nneg > 0) { | |
warning("there are ", nneg, " negative value(s) shifted to zero\n") | |
a[(a < 0)] = 0 | |
} | |
if (identical(a, rep(0, length(a)))) | |
pik1 = a | |
else { | |
pik1 = n * a/sum(a) | |
pik = pik1[pik1 > 0] | |
list1 = pik1 > 0 | |
list = pik >= 1 | |
l = length(list[list == TRUE]) | |
if (l > 0) { | |
l1 = 0 | |
while (l != l1) { | |
x = pik[!list] | |
x = x/sum(x) | |
pik[!list] = (n - l) * x | |
pik[list] = 1 | |
l1 = l | |
list = (pik >= 1) | |
l = length(list[list == TRUE]) | |
} | |
pik1[list1] = pik | |
} | |
} | |
pik1 | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
% File src/library/base/man/sample.Rd | |
% Part of the R package, https://www.R-project.org | |
% Copyright 1995-2023 R Core Team | |
% Distributed under GPL 2 or later | |
\name{sample} | |
\alias{sample} | |
\alias{sample.int} | |
\title{Random Samples and Permutations} | |
\description{ | |
\code{sample} takes a sample of the specified size from the elements | |
of \code{x} using either with or without replacement. | |
} | |
\usage{ | |
sample(x, size, replace = FALSE, prob = NULL) | |
sample.int(n, size = NULL, replace = FALSE, prob = NULL, | |
useHash = (n > 1e+07 && !replace && is.null(prob) && size <= n/2)) | |
} | |
\arguments{ | |
\item{x}{either a vector of one or more elements from which to choose, | |
or a positive integer. See \sQuote{Details.}} | |
\item{n}{a positive number, the number of items to choose from. See | |
\sQuote{Details.}} | |
\item{size}{a non-negative integer giving the number of items to | |
choose. The default is \code{n} except for | |
unequal-probability sampling without replacement, where it is | |
\code{sum(prob)}} | |
\item{replace}{should sampling be with replacement?} | |
\item{prob}{a vector of probability weights for obtaining the elements | |
of the vector being sampled.} | |
\item{useHash}{\code{\link{logical}} indicating if the hash-version of | |
the algorithm should be used. Can only be used for \code{replace = | |
FALSE}, \code{prob = NULL}, and \code{size <= n/2}, and really | |
should be used for large \code{n}, as \code{useHash=FALSE} will use | |
memory proportional to \code{n}.} | |
\item{method}{ | |
When \code{replace=FALSE} and \code{prob} is not \code{NULL}, the | |
method to use for sampling, see Details below. | |
} | |
} | |
\details{ | |
If \code{x} has length 1, is numeric (in the sense of | |
\code{\link{is.numeric}}) and \code{x >= 1}, sampling \emph{via} | |
\code{sample} takes place from \code{1:x}. \emph{Note} that this | |
convenience feature may lead to undesired behaviour when \code{x} is | |
of varying length in calls such as \code{sample(x)}. See the examples. | |
Otherwise \code{x} can be any \R object for which \code{length} and | |
subsetting by integers make sense: S3 or S4 methods for these | |
operations will be dispatched as appropriate. | |
For \code{sample} the default for \code{size} is the number of items | |
inferred from the first argument, so that \code{sample(x)} generates a | |
random permutation of the elements of \code{x} (or \code{1:x}). | |
It is allowed to ask for \code{size = 0} samples with \code{n = 0} or | |
a length-zero \code{x}, but otherwise \code{n > 0} or positive | |
\code{length(x)} is required. | |
Non-integer positive numerical values of \code{n} or \code{x} will be | |
truncated to the next smallest integer, which has to be no larger than | |
\code{\link{.Machine}$integer.max}. | |
The optional \code{prob} argument can be used to give a vector of | |
weights for obtaining the elements of the vector being sampled. They | |
need not sum to one, but they should be non-negative and not all zero. | |
If \code{replace} is true, Walker's alias method (Ripley, 1987) is | |
used when there are more than 200 reasonably probable values: this | |
gives results incompatible with those from \R < 2.2.0. | |
If \code{replace} is false and \code{prob} is supplied there are three | |
options, controlled by \code{method}. The number of nonzero weights | |
must be at least \code{size} in this case and the weights should sum | |
to the desired sample size. | |
All three of the methods have disadvantages. The default, | |
compatible with \R< 4.4.0 is sequential sampling, that | |
is, the probability of choosing the next item is proportional to the | |
weights amongst the remaining items. Sequential sampling is fast, | |
\emph{but the probability of being sampled is not equal or | |
proportional to \code{prob}}. Using \code{"marginal"} | |
draws a sample so that the inclusion probabilities are the values of | |
\code{prob} and by default \code{size=sum(prob)}. It uses Algorithm | |
6.11 from Tille (2006). This is much slower for large \code{n} | |
than the other methods. Finally, \code{"poisson"} does Poisson | |
sampling, where the inclusion probabilities are the values of | |
\code{prob} but the \emph{sample size is random} with mean \code{size} | |
rather than being fixed at \code{size}; it is most useful when | |
\code{size} is large and the variability is thus relatively small. | |
\code{sample.int} is a bare interface in which both \code{n} and | |
\code{size} must be supplied as integers. | |
Argument \code{n} can be larger than the largest integer of | |
type \code{integer}, up to the largest representable integer in type | |
\code{double}. Only uniform sampling is supported. Two | |
random numbers are used to ensure uniform sampling of large integers. | |
} | |
\value{ | |
For \code{sample} a vector of length \code{size} with elements | |
drawn from either \code{x} or from the integers \code{1:x}. | |
For \code{sample.int}, an integer vector of length \code{size} with | |
elements from \code{1:n}, or a double vector if | |
\eqn{n \ge 2^{31}}{n >= 2^31}. | |
} | |
\references{ | |
Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) | |
\emph{The New S Language}. | |
Wadsworth & Brooks/Cole. | |
Ripley, B. D. (1987) \emph{Stochastic Simulation}. Wiley. | |
Tille, Y (2006) \emph{Sampling Algorithms} Springer | |
} | |
\seealso{ | |
\code{\link{RNGkind}(sample.kind = ..)} about random number generation, | |
notably the change of \code{sample()} results with \R version 3.6.0. | |
CRAN package \CRANpkg{sampling} for other methods of weighted sampling | |
without replacement. | |
} | |
\examples{ | |
x <- 1:12 | |
# a random permutation | |
sample(x) | |
# bootstrap resampling -- only if length(x) > 1 ! | |
sample(x, replace = TRUE) | |
# 100 Bernoulli trials | |
sample(c(0,1), 100, replace = TRUE) | |
## More careful bootstrapping -- Consider this when using sample() | |
## programmatically (i.e., in your function or simulation)! | |
# sample()'s surprise -- example | |
x <- 1:10 | |
sample(x[x > 8]) # length 2 | |
sample(x[x > 9]) # oops -- length 10! | |
sample(x[x > 10]) # length 0 | |
## safer version: | |
resample <- function(x, ...) x[sample.int(length(x), ...)] | |
resample(x[x > 8]) # length 2 | |
resample(x[x > 9]) # length 1 | |
resample(x[x > 10]) # length 0 | |
## R 3.0.0 and later | |
sample.int(1e10, 12, replace = TRUE) | |
sample.int(1e10, 12) # not that there is much chance of duplicates | |
## R 4.4.0 and later | |
### Sequential sampling does not give the specified inclusion probability | |
mean(replicate(100, 1 \%in\% sample_int(6, size=3, prob=c(1,1/2,1/2,1/3,1/3,1/3), | |
replace=FALSE, method="sequential"))) | |
### marginal sampling does give the specified inclusion probability | |
mean(replicate(100, 1 \%in\% sample_int(6, size=3, prob=c(1,1/2,1/2,1/3,1/3,1/3), | |
replace=FALSE, method="marginal"))) | |
} | |
\keyword{distribution} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment