|
|
|
# Register the 2018 eBird Darwin Core archive with contentid.
#
# The original eBird download URL,
#   http://ebirddata.ornith.cornell.edu/downloads/gbiff/dwca-1.0.zip
# no longer works, so the 2018 snapshot that was republished via Zenodo is
# registered instead.
#
# Returns the value of contentid::register() for that location
# (presumably the content identifier -- confirm against the contentid docs).
prepare_ebird_2018_id <- function() {
  ebird_data_location <- "https://zenodo.org/record/3858251/files/dwca-1.0.zip"

  ebird_data_id <- contentid::register(ebird_data_location)
  ebird_data_id
}
|
|
|
# Register the 2019 eBird Darwin Core archive with contentid.
#
# The original eBird download URL,
#   http://ebirddata.ornith.cornell.edu/downloads/gbiff/dwca-1.0.zip
# no longer works, so the 2019 snapshot that was republished via Zenodo is
# registered instead.
#
# Returns the value of contentid::register() for that location
# (presumably the content identifier -- confirm against the contentid docs).
prepare_ebird_2019_id <- function() {
  ebird_data_location <- "https://zenodo.org/record/3858443/files/dwca-1.0.zip"

  ebird_data_id <- contentid::register(ebird_data_location)
  ebird_data_id
}
|
|
|
# Count the lines available from a connection and how many of those lines
# contain the text "Branta canadensis".
#
# read_con: a connection that is not yet open. It is opened here in binary
#   read mode and closed again when the function exits, including when
#   reading fails part-way through.
#
# Returns list(with_geese = <number of matching lines>,
#              total = <number of lines read>).
#
# Side effects: prints "." to stdout each time the running total reaches a
# multiple of one million lines, and "done" once the input is exhausted.
count_geese <- function(read_con) {
  open(read_con, "rb")
  # we opened it, so make sure it is released even if readLines() errors
  on.exit(close(read_con), add = TRUE)

  line_batch_size <- 1000
  progress_interval <- line_batch_size * 1000

  # kept as doubles so totals beyond the integer range cannot overflow
  geese_count <- 0
  total_count <- 0

  repeat {
    lines_read <- readLines(con = read_con, n = line_batch_size, skipNul = TRUE)
    if (length(lines_read) == 0) {
      # end of input; leaving here avoids a stray progress dot for an empty
      # final read
      break
    }

    # the species name is a literal string, so no regex engine is needed
    geese_count <- geese_count +
      sum(grepl("Branta canadensis", lines_read, fixed = TRUE))
    total_count <- total_count + length(lines_read)

    if (total_count %% progress_interval == 0) {
      cat(".")
    }
  }

  cat("done\n")

  list(with_geese = geese_count, total = total_count)
}
|
|
|
# Counts the lines that mention Canada Geese (Branta canadensis) in the
# "occurrence.txt" entry of a zip archive, streaming the entry through R's
# unz() connection
# (see also https://stat.ethz.ch/R-manual/R-devel/library/base/html/connections.html).
#
# Note that this fails for eBird, because unz() cuts off the stream
# after 4GB (https://stackoverflow.com/questions/42740206/r-possible-truncation-of-4gb-file).
#
# data_location: path of the zip archive.
# Returns the result of count_geese() for that entry.
count_geese_unz <- function(data_location) {
  count_geese(unz(description = data_location, filename = "occurrence.txt"))
}
|
|
|
# Counts the lines that mention Canada Geese (Branta canadensis) in the
# "occurrence.txt" entry of a zip archive, streaming the entry from the
# command-line unzip tool (see also https://linux.die.net/man/1/unzip).
#
# data_location: path of the zip archive.
# Returns the result of count_geese() for that entry.
count_geese_linux_unzip <- function(data_location) {
  # Quote the archive path so a location containing spaces or shell
  # metacharacters reaches unzip as a single argument. path.expand() runs
  # first because a quoted "~" would no longer be expanded by the shell.
  archive_arg <- shQuote(path.expand(data_location))
  unzip_command <- paste("unzip -p", archive_arg, "occurrence.txt")

  count_geese(pipe(unzip_command))
}
|
|
|
# Runs an analysis on the data that a content identifier points to.
#
# data_id: identifier passed to contentid::resolve().
# algorithm: function that is called with the resolved data location.
#
# Returns whatever algorithm() returns.
do_research <- function(data_id, algorithm) {
  # look up where the content behind data_id can currently be found
  resolved_location <- contentid::resolve(data_id)

  algorithm(resolved_location)
}
|
|
|
# Re-runs a goose-counting algorithm on the 2018 and 2019 eBird snapshots and
# emits a message() for every count that differs from the expected value.
#
# algorithm: function that takes a data location and returns a list with
#   `with_geese` and `total` elements, as count_geese() does.
reproduce_results <- function(algorithm) {
  ebird_2018 <- do_research(
    "hash://sha256/29d30b566f924355a383b13cd48c3aa239d42cba0a55f4ccfc2930289b88b43c",
    algorithm = algorithm
  )

  # assumption from:
  # unzip -p ec3ff57cb48d5c41b77b5d1075738b40f598a900e8be56e7645e5a24013dffc4.dir/dwca-1.0.zip occurrence.txt | grep "Branta canadensis" | wc -l
  # NOTE(review): the path above is the 2019 archive hash; presumably the 2018
  # figure came from the 29d30b56... archive -- confirm and correct.
  if (ebird_2018$with_geese != 4613847) {
    message("failed to reproduce geese count of eBird 2018")
  }
  if (ebird_2018$total != 361429889) {
    message("the number of lines in eBird 2018 do not add up")
  }

  ebird_2019 <- do_research(
    "hash://sha256/ec3ff57cb48d5c41b77b5d1075738b40f598a900e8be56e7645e5a24013dffc4",
    algorithm = algorithm
  )

  # assumption from:
  # unzip -p ec3ff57cb48d5c41b77b5d1075738b40f598a900e8be56e7645e5a24013dffc4.dir/dwca-1.0.zip occurrence.txt | grep "Branta canadensis" | wc -l
  if (ebird_2019$with_geese != 7066919) {
    message("failed to reproduce geese count of eBird 2019")
  }
  if (ebird_2019$total != 561852543) {
    message("the number of lines in eBird 2019 do not add up")
  }
}
|
|
|
# Runs reproduce_results() once per available counting algorithm:
# first the unz()-based reader, then the command-line unzip reader.
reproduce_all <- function() {
  algorithms <- list(count_geese_unz, count_geese_linux_unzip)

  for (algorithm in algorithms) {
    reproduce_results(algorithm = algorithm)
  }
}
@cboettig Please see the email and this outline for the talk.