Skip to content

Instantly share code, notes, and snippets.

@jslefche
Last active July 10, 2019 18:05
Show Gist options
  • Save jslefche/2079ddf272f6c725812d88d50f200c1a to your computer and use it in GitHub Desktop.
Save jslefche/2079ddf272f6c725812d88d50f200c1a to your computer and use it in GitHub Desktop.
Faster coverage-based rarefaction

Faster coverage-based subsampling

A revised function from the iNEXT package (https://github.com/JohnsonHsieh/iNEXT) that calculates coverage-based rarefaction from abundance data. Instead of computing all orders of q it allows the user to specify a single order, and additionally omits bootstrapping of standard errors. The resulting function is much, much faster.

EXAMPLE

# Create fake community-by-species abundance matrix
mat <- sample(1:50, 100, replace = T)

mat[sample(1:100, 25, replace = T)] <- 0

mat <- matrix(mat)

# Apply new function
estimateD.new(mat) # default is richness

# Try with iNEXT
iNEXT:::estimateD(t(mat), base = "coverage")
#' estimateD.new = faster coverage-based rarefaction
#' @param x a community (rows) by species (cols) matrix
#' @param q the level of q to be used for calculating diversity (default is richness)
#' @param level a user-defined level of coverage
#' @return a data.frame with the sample size, method of estimation, sample coverage, and estimated diversity
#'
estimateD.new <- function(x, q = 0, level=NULL){
C <- level
if(is.null(C))
C <- min(unlist(apply(x, 1, function(x) Chat.Ind(x, sum(x)))))
do.call(rbind, apply(x, 1, function(x) invChat.Ind(x, q, C)))
}
#' helper functions
Chat.Ind <- function(x, m){
x <- x[x>0]
n <- sum(x)
f1 <- sum(x == 1)
f2 <- sum(x == 2)
f0.hat <- ifelse(f2 == 0, (n - 1) / n * f1 * (f1 - 1) / 2, (n - 1) / n * f1 ^ 2/ 2 / f2) #estimation of unseen species via Chao1
A <- ifelse(f1>0, n*f0.hat/(n*f0.hat+f1), 1)
Sub <- function(m){
#if(m < n) out <- 1-sum(x / n * exp(lchoose(n - x, m)-lchoose(n - 1, m)))
if(m < n) {
xx <- x[(n-x)>=m]
out <- 1-sum(xx / n * exp(lgamma(n-xx+1)-lgamma(n-xx-m+1)-lgamma(n)+lgamma(n-m)))
}
if(m == n) out <- 1-f1/n*A
if(m > n) out <- 1-f1/n*A^(m-n+1)
out
}
sapply(m, Sub)
}
invChat.Ind <- function(x, q, C){
m <- NULL # no visible binding for global variable 'm'
n <- sum(x)
refC <- Chat.Ind(x,n)
f <- function(m, C) abs(Chat.Ind(x,m)-C)
if(refC > C) {
opt <- optimize(f, C=C, lower=0, upper=sum(x))
mm <- opt$minimum
mm <- round(mm)
}
if(refC <= C) {
f1 <- sum(x==1)
f2 <- sum(x==2)
if(f1>0 & f2>0){A <- (n-1)*f1/((n-1)*f1+2*f2)}
if(f1>1 & f2==0){A <- (n-1)*(f1-1)/((n-1)*(f1-1)+2)}
if(f1==1 & f2==0){A <- 1}
if(f1==0 & f2==0){A <- 1}
mm <- (log(n/f1)+log(1-C))/log(A)-1
mm <- n+mm
mm <- round(mm)
}
if(mm > 2*n) warning("The maximum size of the extrapolation exceeds double reference sample size, the results for q = 0 may be subject to large prediction bias.")
method <- ifelse(mm<n, "interpolated", ifelse(mm==n, "observed", "extrapolated"))
out <- data.frame(m=mm,
method=method,
coverage=round(Chat.Ind(x,mm),3),
diversity=round(Dqhat.Ind(x,q,mm),3)
)
out
}
Dqhat.Ind <- function(x, q, m){
x <- x[x > 0]
n <- sum(x)
fk.hat <- function(x, m){
x <- x[x > 0]
n <- sum(x)
if(m <= n){
Sub <- function(k) sum(exp(lchoose(x, k) + lchoose(n - x, m -k) - lchoose(n, m)))
sapply(1:m, Sub)
}
else {
f1 <- sum(x == 1)
f2 <- sum(x == 2)
A <- ifelse(f2 > 0, (n-1)*f1/((n-1)*f1+2*f2), (n-1)*f1/((n-1)*f1+2))
C.hat <- 1 - f1 / n * A
p.hat <- x / n * C.hat
Sub <- function(k) sum((choose(m, k) * p.hat^k * (1 - p.hat)^(m - k)) / (1 - (1 - p.hat)^n))
sapply(1:m, Sub)
}
}
D0.hat <- function(x, m){
x <- x[x > 0]
n <- sum(x)
Sub <- function(m){
if(m <= n){
Fun <- function(x){
if(x <= (n - m)) exp(lgamma(n - x + 1) + lgamma(n - m + 1) - lgamma(n - x - m + 1) - lgamma(n + 1))
else 0
}
sum(1 - sapply(x, Fun))
}
else {
Sobs <- sum(x > 0)
f1 <- sum(x == 1)
f2 <- sum(x == 2)
f0.hat <- ifelse(f2 == 0, (n - 1) / n * f1 * (f1 - 1) / 2, (n - 1) / n * f1 ^ 2/ 2 / f2) #estimation of unseen species via Chao1
A <- n*f0.hat/(n*f0.hat+f1)
ifelse(f1 ==0, Sobs ,Sobs + f0.hat * (1 - A ^ (m - n)))
}
}
sapply(m, Sub)
}
D1.hat <- function(x, m){
x <- x[x > 0]
n <- sum(x)
Sub <- function(m){
if(m < n){
k <- 1:m
exp(-sum(k / m * log(k / m) * fk.hat(x, m)))
}
else{
#UE=sum(sapply(1:(n-1),function(k){sum(1/k*x/n*exp(lchoose(n-x,k)-lchoose(n-1,k)))}))
UE <- sum(x/n*(digamma(n)-digamma(x)))
f1 <- sum(x == 1)
f2 <- sum(x == 2)
A <- 1 - ifelse(f2 > 0, (n-1)*f1/((n-1)*f1+2*f2), (n-1)*f1/((n-1)*f1+2))
#A=2*sum(x==2)/((n-1)*sum(x==1)+2*sum(x==2))
B <- ifelse(A<1, sum(x==1)/n*(1-A)^(-n+1)*(-log(A)-sum(sapply(1:(n-1),function(k){1/k*(1-A)^k}))), 0)
D.hat <- exp(UE+B)
Dn <- exp(-sum(x / n * log(x / n)))
a <- 1:(n-1)
b <- 1:(n-2)
Da <- exp(-sum(a / (n-1) * log(a / (n-1)) * fk.hat(x, (n-1))))
Db <- exp(-sum(b / (n-2) * log(b / (n-2)) * fk.hat(x, (n-2))))
Dn1 <- ifelse(Da!=Db, Dn + (Dn-Da)^2/(Da-Db), Dn)
b <- ifelse(D.hat>Dn, (Dn1-Dn)/(D.hat-Dn), 0)
# b <- A
ifelse(b!=0, Dn + (D.hat-Dn)*(1-(1-b)^(m-n)), Dn)
}
}
sapply(m, Sub)
}
D2.hat <- function(x, m){
Sub <- function(m) 1 / (1 / m + (1 - 1 / m) * sum(x * (x - 1) / n / (n - 1)))
sapply(m, Sub)
}
Dq.hat <- function(x, m){
Sub <- function(m){
k <- 1:m
sum( (k / m)^q * fk.hat(x, m))^(1 / (1 - q))
}
sapply(m, Sub)
}
if(q == 0) D0.hat(x, m)
else if(q == 1) D1.hat(x, m)
else if(q == 2) D2.hat(x, m)
else Dq.hat(x, m)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment