Skip to content

Instantly share code, notes, and snippets.

@jslefche
Last active February 17, 2021 14:29
Show Gist options
  • Save jslefche/4f34c8f0d449bb50c27c87f3deacfec5 to your computer and use it in GitHub Desktop.
Save jslefche/4f34c8f0d449bb50c27c87f3deacfec5 to your computer and use it in GitHub Desktop.
covstop: A function for coverage-based stopping of biodiversity sampling

covstop: Coverage-based stopping for biodiversity sampling

A function that takes a sample (rows)-by-species (columns) community matrix and uses coverage-based subsampling to identify the number of samples of individuals required to achieve a given level of coverage of total biodiversity.

From: Chao, Anne, and Lou Jost. "Coverage‐based rarefaction and extrapolation: standardizing samples by completeness rather than size." Ecology 93.12 (2012): 2533-2547.

Example

# generate fake community matrix
vec <- sample(0:100, 100, replace = T)

vec[sample(1:100, 50)] <- 0

mat <- matrix(vec, nrow = 10)

# determine number of samples required
covstop(mat, type = "samples")

# determine number of samples required for 99% coverage
covstop(mat, Cn = 0.98)

# determine number of individuals required
covstop(mat, type = "individuals")
#' covstop: a function to implement coverage-based stopping
#'
#' @param comm a site (rows)-by-species (columns) data.frame
#' @param type whether the output should be in number of individuals ("individuals") or in
#' number of sampling units ("samples")
#' @param Cn desired level of coverage
#' @param se whether bootstrapped standard errors should be returned
#' @param knots a number of knots of computation, default is 40
#'
#' @return the number of individuals or samples required to meet the desired level of coverage
#'
#' @examples
#' comm <- sample(100)
#'
#' comm[sample(1:100, 50, replace = F)] <- 0
#'
#' comm <- matrix(comm, nrow = 10)
#'
#' covstop(comm)
#'
covstop <- function(comm, type = "samples", Cn = 0.90, se = FALSE, knots = 40) {
if(Cn > 1) stop("Coverage level must be between (0, 1)")
if(type == "samples") comm <- data.frame(N = nrow(comm), t(colSums(comm > 0)))
comm <- split(comm, 1:nrow(comm))
sc <- sapply(comm, function(Spec) {
Spec <- as.numeric(Spec)
endpoint <- 2 * sum(Spec)
if(type == "individuals") {
n <- sum(Spec)
if (endpoint <= n) {
m <- floor(seq(1, endpoint, length = floor(knots)))
} else {
m <- c(floor(seq(1, sum(Spec) - 1, length.out = floor(knots/2) - 1)), sum(Spec), floor(seq(sum(Spec) + 1, to = endpoint, length.out = floor(knots/2))))
}
m <- c(1, m[-1])
C <- Chat.Ind(Spec, m)
# if(se == TRUE) {
#
# Prob.hat <- EstiBootComm.Ind(Spec)
#
# Abun.Mat <- rmultinom(nboot, n, Prob.hat)
#
# error.C <- qnorm(1 - (1 - 0.95) / 2) * apply(apply(Abun.Mat, 2, function(x) Chat.Ind(x, m)), 1, sd, na.rm = TRUE)
#
# }
} else if(type == "samples") {
nT <- Spec[1]
if(endpoint <= nT) {
m <- floor(seq(1, endpoint, length.out = floor(knots)))
} else {
m <- c(floor(seq(1, nT - 1, length.out = floor(knots/2) - 1)), nT, floor(seq(nT + 1, to = endpoint, length.out = floor(knots/2))))
}
m <- c(1, m[-1])
C <- Chat.Sam(Spec, m)
}
m[which.min(C <= Cn)]
} )
# if(se == TRUE)
#
# paste0("Number of ", type, " needed to reach ", Cn*100, "% coverage: ", as.numeric(sc[which.max(sc)]), "+/-", as.numeric(error.C))
x <- as.numeric(sc[which.max(sc)])
# print(paste0("Number of ", type, " needed to reach ", Cn*100, "% coverage: ", x))
x
}
### helper functions from iNEXT
### credit: T.C. Hsieh
Chat.Ind <- function (x, m) {
x <- x[x > 0]
n <- sum(x)
f1 <- sum(x == 1)
f2 <- sum(x == 2)
f0.hat <- ifelse(f2 == 0, (n - 1)/n * f1 * (f1 - 1)/2, (n -
1)/n * f1^2/2/f2)
A <- ifelse(f1 > 0, n * f0.hat/(n * f0.hat + f1), 1)
Sub <- function(m) {
if (m < n) {
xx <- x[(n - x) >= m]
out <- 1 - sum(xx/n * exp(lgamma(n - xx + 1) - lgamma(n -
xx - m + 1) - lgamma(n) + lgamma(n - m)))
}
if (m == n)
out <- 1 - f1/n * A
if (m > n)
out <- 1 - f1/n * A^(m - n + 1)
out
}
sapply(m, Sub)
}
Chat.Sam <- function(x, t){
nT <- x[1]
y <- x[-1]
y <- y[y>0]
U <- sum(y)
Q1 <- sum(y == 1)
Q2 <- sum(y == 2)
Q0.hat <- ifelse(Q2 == 0, (nT - 1) / nT * Q1 * (Q1 - 1) / 2, (nT - 1) / nT * Q1 ^ 2/ 2 / Q2) #estimation of unseen species via Chao2
A <- ifelse(Q1>0, nT*Q0.hat/(nT*Q0.hat+Q1), 1)
Sub <- function(t){
if(t < nT) {
yy <- y[(nT-y)>=t]
out <- 1 - sum(yy / U * exp(lgamma(nT-yy+1)-lgamma(nT-yy-t+1)-lgamma(nT)+lgamma(nT-t)))
}
#if(t < nT) out <- 1 - sum(y / U * exp(lchoose(nT - y, t) - lchoose(nT - 1, t)))
if(t == nT) out <- 1 - Q1 / U * A
if(t > nT) out <- 1 - Q1 / U * A^(t - nT + 1)
out
}
sapply(t, Sub)
}
EstiBootComm.Ind <- function(Spec)
{
Sobs <- sum(Spec > 0) #observed species
n <- sum(Spec) #sample size
f1 <- sum(Spec == 1) #singleton
f2 <- sum(Spec == 2) #doubleton
f0.hat <- ifelse(f2 == 0, (n - 1) / n * f1 * (f1 - 1) / 2, (n - 1) / n * f1 ^ 2/ 2 / f2) #estimation of unseen species via Chao1
A <- ifelse(f1>0, n*f0.hat/(n*f0.hat+f1), 1)
a <- f1/n*A
b <- sum(Spec / n * (1 - Spec / n) ^ n)
if(f0.hat==0){
w <- 0
if(sum(Spec>0)==1){
warning("This site has only one species. Estimation is not robust.")
}
}else{
w <- a / b #adjusted factor for rare species in the sample
}
Prob.hat <- Spec / n * (1 - w * (1 - Spec / n) ^ n) #estimation of relative abundance of observed species in the sample
Prob.hat.Unse <- rep(a/ceiling(f0.hat), ceiling(f0.hat)) #estimation of relative abundance of unseen species in the sample
return(sort(c(Prob.hat, Prob.hat.Unse), decreasing=TRUE)) #Output: a vector of estimated relative abundance
}
EstiBootComm.Sam <- function(Spec)
{
nT <- Spec[1]
Spec <- Spec[-1]
Sobs <- sum(Spec > 0) #observed species
Q1 <- sum(Spec == 1) #singleton
Q2 <- sum(Spec == 2) #doubleton
Q0.hat <- ifelse(Q2 == 0, (nT - 1) / nT * Q1 * (Q1 - 1) / 2, (nT - 1) / nT * Q1 ^ 2/ 2 / Q2) #estimation of unseen species via Chao2
A <- ifelse(Q1>0, nT*Q0.hat/(nT*Q0.hat+Q1), 1)
a <- Q1/nT*A
b <- sum(Spec / nT * (1 - Spec / nT) ^ nT)
if(Q0.hat==0){
w <- 0
if(sum(Spec>0)==1){
warning("This site has only one species. Estimation is not robust.")
}
}else{
w <- a / b #adjusted factor for rare species in the sample
}
Prob.hat <- Spec / nT * (1 - w * (1 - Spec / nT) ^ nT) #estimation of detection probability of observed species in the sample
Prob.hat.Unse <- rep(a/ceiling(Q0.hat), ceiling(Q0.hat)) #estimation of detection probability of unseen species in the sample
return(sort(c(Prob.hat, Prob.hat.Unse), decreasing=TRUE)) #Output: a vector of estimated detection probability
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment