Skip to content

Instantly share code, notes, and snippets.

@paucoma
Last active February 20, 2020 16:38
Show Gist options
  • Save paucoma/36343a189ffd62aefa7ee87d71b18fcf to your computer and use it in GitHub Desktop.
Save paucoma/36343a189ffd62aefa7ee87d71b18fcf to your computer and use it in GitHub Desktop.
Python Script to simulate Population Estimation using Capture Recapture Method
# Script to simulate Population Estimation function by paui
# Inspired to investigate after watching Matts (standupmaths) video:
# How to estimate a population using statisticians
# https://www.youtube.com/watch?v=MTmnVBJ9gCI
#
# Capture-Recapture Method
# Single-Shot version
# eN - population estimate
# M - number of marked samples before re-capture
# C - recapture sample size
# R - number of marked samples found in re-capture
#
# eN = M*C/R , to avoid R = 0 case --> eN = ()(M+1)*(C+1)/(R+1))-1
#
# Multiple iterations Method
#
# eN = \Sum_1*N {M[i]*C[i]} / \Sum_1^N {R[i]}
#
# M increments with each new capture, unmarked samples get marked
#
# Variance estimate formula from: https://onlinecourses.science.psu.edu/stat506/node/58/
# Variance Estimate:
# var_eN[r] = abs(round((M[r]*C[r]*(M[r]-R[r])*(C[r]-R[r]))/(R[r]*R[r]*R[r])))
# To deal with the case when x = 0 and we do not want to estimate τ by infinity, a modified estimator for τ is:
# (M[r]+1)*(C[r]+1)/()
# Capture probability estimate from: http://ag.unr.edu/sedinger/Courses/ERS488-688/lectures/openjolly-seber.pdf
# Matts specific numbers were:
# M=64, C=67, R=21 --> eN=204 , Var=916 --> stddev = 30
# completely off, so probably
# - the sampling wasn't independent enough
# - the population wasn't a closed set durring the sampling periods
# ------------------------------
# --- Variables to play with ---
myPopulationSize = 47500 # must be integer > 0
# Definition of (re)Captures "C[]"
# --------------------------------
# Option A - Formulated sample size
if (1):
#lets define total number of rounds to simulate
nRounds = 20
#lets define our capture size for each recapture round
C=[None]*nRounds
for i in range(0,nRounds) :
C[i]= 40
else:
# Option B - Directly specified [set above if to false to enter here]
C=[64,32,16,8]
nRounds = len(C)
# The first element in C array is the initial sample size captured and marked
# --- End Variables ---
# ---------------------
#from statistics import median
import random
import math
# empty set to start with
setN = set()
# add elements to the set
for i in range(1,myPopulationSize):
setN.add(i)
setC=[None]*nRounds
setM=[None]*nRounds
M=[None]*nRounds
R=[None]*nRounds
p=[None]*nRounds
#LP_eN=[None]*nRounds
#SeLP_eN=[None]*nRounds
eN=[None]*nRounds
stdev_eN=[None]*nRounds
confLow_eN=[None]*nRounds
confHigh_eN=[None]*nRounds
boundLow_eN=[None]*nRounds
boundHigh_eN=[None]*nRounds
rN=[]
print("Round\tMarked \t p \t\t est \t pest \tstdev \t\tLow \tHigh")
#Lets get our first capture
setC[0]=set(random.sample(setN,C[0]))
setM[0]=setC[0] #we mark all in round 1
rN.clear()
for r in range(1,len(C)):
M[r] = len(setM[r-1]) #number of marked at the begining of round r
setC[r] = set(random.sample(setN,C[r])) # Generating the Recapture round r
R[r] = len(set(setM[r-1]&setC[r])) # number of recaptures round r
setM[r] = set(setM[r-1]|setC[r]) # we mark the new finds in capture round r
p[r] = round(1.0*R[r]/M[r],9) #capture probability of round r
#Lets calculate our population estimate
rN.append(r)
topSum = botSum = 0
#for i in rN :
# Lincoln-Petersen Estimator
# topSum = topSum + (M[i])*(C[i])
# botSum = botSum + (R[i]+1E-6)#1E-6 for zero case
#LP_eN[r] = round(1.0*topSum/botSum) #population estimation on round r
# 20/02/19
#for i in rN :
# Seber 1982 evolution of Lincoln-Petersen method (also avoids R[i] = 0 case)
# topSum = topSum + (M[i]+1)*(C[i]+1)
# botSum = botSum + (R[i]+1)
#SeLP_eN[r] = round(1.0*topSum/botSum-len(rN)) #population estimation on round r
# 20/02/20 : since we are going to skip R[r] = 0 cases -> back to initial variable calculation
if p[r] > 0 :
for i in rN :
# Lincoln-Petersen Estimator
topSum = topSum + (M[i])*(C[i])
botSum = botSum + (R[i])
eN[r] = round(1.0*topSum/botSum) #population estimation on round r
#eN[r] = round(1.0*topSum/botSum-len(rN)) #population estimation on round r
#var_eN[r] = abs(round((M[r]*C[r]*(M[r]-R[r])*(C[r]-R[r]))/(R[r]*R[r]*R[r])))
#20/02/19: added the +1 below to avoid zero case error, mostly insignificant in comparison to R[r]^3
#var_eN[r] = abs(round((M[r]*C[r]*(M[r]-R[r])*(C[r]-R[r]))/(R[r]*R[r]*R[r]+1)))
#20/02/20: variance estimation "Error" from original approximation is 50% undershoot if R=1;12%,R=2;4%,R=3
# Modifying to overshoot, by how much depends on how many R[r] s' you add to both top bottom and +1 or +0.1 ,...
# Play with relative significance of "R"'s to zero eliminating constant "+1,+0.1,..."
# we want the zero case eliminator (+0.1) to influence as little as possible
# {...}*(R[r]+0.01)/((R[r]^3)*R[r]+0.01) -> 0%,R=1; +0.4%,R=2; +0.3%,R=3; +0.2%,R=4
# var_eN[r] = abs(round((M[r]*C[r]*(M[r]-R[r])*(C[r]-R[r])*(R[r]+0.01))/((R[r]*R[r]*R[r])*R[r]+0.01)))
# 20/02/20 : since we are going to skip R[r] = 0 cases -> back to initial variable calculation
# simplified math.sqrt(var_eN[r]) -> stdev_eN[]
stdev_eN[r] = round(math.sqrt(abs((M[r]*C[r]*(M[r]-R[r])*(C[r]-R[r]))/(R[r]*R[r]*R[r]))))
#Confidence Level 0.70 0.75 0.80 0.85 0.90 0.92 0.95 0.96 0.98 0.99
#z 1.04 1.15 1.28 1.44 1.645 0.75 1.96 2.05 2.33 2.58
confLow_eN[r] = 1.96*math.sqrt(((1-R[r]/M[r])*(R[r]/C[r])*(1-R[r]/C[r]))/(C[r]-1))+1/(2*C[r])
confHigh_eN[r] = R[r]/C[r]-confLow_eN[r]
confLow_eN[r] = R[r]/C[r]+confLow_eN[r]
boundLow_eN[r] = round(M[r]/confLow_eN[r])
boundHigh_eN[r] = round(M[r]/confHigh_eN[r])
#std deviation =sqrt(Var)
#print(" %d\t %d \t %1.3f \t %d \t %d \t%3.2f \t\t%3.2f\t%3.2f"%(r,M[r],p[r],eN[r],round(C[r]/(p[r]+1E-6)),stdev_eN[r],M[r]/confLow_eN[r],M[r]/confHigh_eN[r]))
#lets define when an estimate is valid... conditions:
# 2*stddev -> 95% of cases contemplated
# z= 1.96 --> 0.95 confidence level for boundries
# 1. is estimate > 2*stddev ?
# 2. is estimate between boundries ?
# ?3. is pest < est ? <--- maybe --> in large numbers always falls behind
#if (eN[r]>(stdev_eN[r])) & (eN[r]>boundLow_eN[r]) & (eN[r]<boundHigh_eN[r]):
#boundries for large numbers doesnt seem to work out
if (eN[r]>(stdev_eN[r])) :
print(" %d\t %d \t %1.4f \t %d \t %d \t%d \t\t%d\t%d"%(r,M[r],p[r],eN[r],round(C[r]/(p[r])),stdev_eN[r],boundLow_eN[r],boundHigh_eN[r]))
#eN.remove(None) #None is not accepted in the function median
#print("median of last %d rounds: %d"%(int(nRounds/2),median(eN[int(nRounds/-2):])))
print("real Popualtion %d"%len(setN))
@LuisDAlfaro
Copy link

LuisDAlfaro commented Feb 19, 2020 via email

@paucoma
Copy link
Author

paucoma commented Feb 19, 2020

I had a little play around with the code and updated it to better handle divide by zero cases without affecting significantly the estimate generated.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment