Intro:
I1. URLs specify a location, not content.
I2. You can't "download" a DOI.
I3. Content hashes are unique, content-based identifiers for data.
I4. idea: reference data by content id in your analysis, and register known locations of the associated content in registries (see the sketch below)
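A minimal sketch of the idea using the contentid R package; the URL below is a placeholder, not a real dataset location:

```r
library(contentid)

# register() hashes whatever bytes currently live at the URL, records the
# (hash, URL) pair in a registry, and returns the hash as an identifier
# for the bytes themselves, e.g. "hash://sha256/<64 hex digits>".
id <- contentid::register("https://example.org/bird-observations.csv")

# resolve() goes the other way: given a content id, it finds a registered
# location that still serves matching bytes and returns a local file path.
path <- contentid::resolve(id)
```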
Use case:
Reliably Counting Canada Geese (the most prominent bird near Lake Merritt)
For (pseudo-)code, see: https://gist.github.com/jhpoelen/19aba7c7c57d6da217ca644dc7634c02#file-count_geese-r
U1. Discover eBird datasets (oops, they have all disappeared behind a registration wall)
U2. Use a time machine (Preston) to recover, re-publish, and register the eBird 2018/2019 publications (see versions at http://doi.org/10.5281/zenodo.3858250) using contentid::register(...)
U3. Develop your method (the function count_geese)
U4. Reproduce the count using content ids + contentid::resolve(...) (see the sketch after this list)
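A hedged sketch of steps U2 through U4. contentid::register() and contentid::resolve() are the package's actual entry points, but the URLs, file names, column name, and species string below are placeholders; the real implementation is in the gist linked above:

```r
library(contentid)

# U2: register known locations of the re-published eBird snapshots.
# (Placeholder URLs; the real copies are versioned under
# http://doi.org/10.5281/zenodo.3858250.)
id_2018 <- contentid::register("https://example.org/ebd_lake_merritt_2018.txt")
id_2019 <- contentid::register("https://example.org/ebd_lake_merritt_2019.txt")

# U3: the method takes a content id, never a URL or a local path.
# resolve() fetches a verified copy from any registered location and
# caches it locally (store = TRUE).
count_geese <- function(content_id) {
  path <- contentid::resolve(content_id, store = TRUE)
  obs <- read.delim(path)  # eBird exports are tab-delimited
  sum(obs$COMMON.NAME == "Canada Goose", na.rm = TRUE)
}

# U4: anyone holding the content ids can reproduce the counts,
# no matter where the bytes happen to live in the future.
count_geese(id_2018)
count_geese(id_2019)
```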
Conclusions:
C1. Future-proof your scripts by using content-based identifiers instead of URLs or local paths
C2. Register locations for content ids anytime and anywhere (with or without a time machine; also suitable for embargoed/private data)
C3. Use content ids!
@cboettig Please see email and this outline for talk.