joelnitta · March 3, 2021 21:02
diff --git a/write_bgb_geo_matrix.R b/write_bgb_geo_matrix.R
 #' Write out a geography file in PHYLIP format for BioGeoBears
 #'
 #' @param data Dataframe; species distributions in regions, with one row per 
 #' region per species. Must have columns 'species' and 'region'
 #' @param output_file Path to write outputfile
 #'
 #' @author Joel Nitta
 #' 
 #' @return The the path to `output_file`.
 #' 
 write_bgb_geo_matrix <- function (data, output_file) {
  
  require(magrittr)
  
  # Extract region names from data
  region_names <- data %>% dplyr::pull(region) %>% unique %>% sort
  
  # Make sure that worked
  assertthat::assert_that(length(region_names) > 1, msg = "Need at least two regions")
  
  # Make sure column names are "species" and "region"
  assertthat::assert_that(
    all(colnames(data) %in% c("species", "region")), 
    msg = "Data must include columns 'species' and 'region'")
  
  # Convert input dataframe of regions (one row per region per species) into a dataframe
  # with one row per species and a column of the combined occurrence in regions as 0101, etc.
  geo_for_bgb <- 
    data %>%
    dplyr::select(species, region) %>%
    # Check for NA in names and regions
    assertr::assert(assertr::not_na, species, region) %>%
    # Check for spaces in species names
    assertr::verify(all(stringr::str_detect(species, " ") == FALSE)) %>%
    # Check regions are within expected range of values
    assertr::assert(assertr::in_set(region_names), region) %>%
    assertr::verify(all(region_names %in% region)) %>%
    # Use only unique combinations of species x regions
    unique() %>%
    dplyr::mutate(abundance = 1) %>%
    tidyr::pivot_wider(names_from = region, values_from = abundance, values_fill = list(abundance = 0)) %>%
    # Rearrange regions into alphabetical order
    dplyr::select(species, dplyr::all_of(region_names)) %>%
    # Smash region occurrences into a single string per species (e.g., 0101)
    tidyr::unite(combined, dplyr::all_of(region_names), sep = "", remove = FALSE) %>%
    dplyr::select(species, combined) %>%
    # Make sure each instance of "combined" has exactly the number of characters (regions) expected
    assertr::assert(function(x) nchar(x) == length(region_names), combined)
  
  # Get some summary numbers for PHYLIP header: number of species, number of regions, names of regions
  n_sp <- nrow(geo_for_bgb)
  n_areas <- length(region_names)
  areas <- paste(region_names, collapse = " ")
  
  # Format matrix for output in BioGeoBears format.
  #
  # Nick's comments:
  # ---
  # Geography file
  # Notes:
  # 1. This is a PHYLIP-formatted file. This means that in the 
  #    first line, 
  #    - the 1st number equals the number of rows (species)
  #    - the 2nd number equals the number of columns (number of areas)
  #    - after a tab, put the areas in parentheses, with spaces: (A B C D)
  #
  # 1.5. Example first line:
  #    10    4    (A B C D)
  # 
  # 2. The second line, and subsequent lines:
  #    speciesA    0110
  #    speciesB    0111
  #    speciesC    0001
  #         ...
  # 
  # 2.5a. This means a TAB between the species name and the area 0/1s
  # 2.5b. This also means NO SPACE AND NO TAB between the area 0/1s.
  # 
  # 3. See example files at:
  #    http://phylo.wikidot.com/biogeobears#files
  #
  # ---
  #
  # (Note that Nick's comments say to separate number of areas and names of areas
  # with a tab, but it is a space in his example data. Here, use tab.)
  line_1 <- glue::glue("{n_sp}\t{n_areas}\t({areas})") %>% as.character()
  rest <- glue::glue("{geo_for_bgb$species}\t{geo_for_bgb$combined}") %>% as.character()
  
  geo_for_bgb_formatted <- c(line_1, rest)
  
  readr::write_lines(geo_for_bgb_formatted, output_file)
  
  output_file
  
 }
	#' Write out a geography file in PHYLIP format for BioGeoBears
	#'
	#' @param data Dataframe; species distributions in regions, with one row per
	#' region per species. Must have columns 'species' and 'region'
	#' @param output_file Path to write outputfile
	#'
	#' @author Joel Nitta
	#'
	#' @return The the path to `output_file`.
	#'
	write_bgb_geo_matrix <- function (data, output_file) {

	require(magrittr)

	# Extract region names from data
	region_names <- data %>% dplyr::pull(region) %>% unique %>% sort

	# Make sure that worked
	assertthat::assert_that(length(region_names) > 1, msg = "Need at least two regions")

	# Make sure column names are "species" and "region"
	assertthat::assert_that(
	all(colnames(data) %in% c("species", "region")),
	msg = "Data must include columns 'species' and 'region'")

	# Convert input dataframe of regions (one row per region per species) into a dataframe
	# with one row per species and a column of the combined occurrence in regions as 0101, etc.
	geo_for_bgb <-
	data %>%
	dplyr::select(species, region) %>%
	# Check for NA in names and regions
	assertr::assert(assertr::not_na, species, region) %>%
	# Check for spaces in species names
	assertr::verify(all(stringr::str_detect(species, " ") == FALSE)) %>%
	# Check regions are within expected range of values
	assertr::assert(assertr::in_set(region_names), region) %>%
	assertr::verify(all(region_names %in% region)) %>%
	# Use only unique combinations of species x regions
	unique() %>%
	dplyr::mutate(abundance = 1) %>%
	tidyr::pivot_wider(names_from = region, values_from = abundance, values_fill = list(abundance = 0)) %>%
	# Rearrange regions into alphabetical order
	dplyr::select(species, dplyr::all_of(region_names)) %>%
	# Smash region occurrences into a single string per species (e.g., 0101)
	tidyr::unite(combined, dplyr::all_of(region_names), sep = "", remove = FALSE) %>%
	dplyr::select(species, combined) %>%
	# Make sure each instance of "combined" has exactly the number of characters (regions) expected
	assertr::assert(function(x) nchar(x) == length(region_names), combined)

	# Get some summary numbers for PHYLIP header: number of species, number of regions, names of regions
	n_sp <- nrow(geo_for_bgb)
	n_areas <- length(region_names)
	areas <- paste(region_names, collapse = " ")

	# Format matrix for output in BioGeoBears format.
	#
	# Nick's comments:
	# ---
	# Geography file
	# Notes:
	# 1. This is a PHYLIP-formatted file. This means that in the
	# first line,
	# - the 1st number equals the number of rows (species)
	# - the 2nd number equals the number of columns (number of areas)
	# - after a tab, put the areas in parentheses, with spaces: (A B C D)
	#
	# 1.5. Example first line:
	# 10 4 (A B C D)
	#
	# 2. The second line, and subsequent lines:
	# speciesA 0110
	# speciesB 0111
	# speciesC 0001
	# ...
	#
	# 2.5a. This means a TAB between the species name and the area 0/1s
	# 2.5b. This also means NO SPACE AND NO TAB between the area 0/1s.
	#
	# 3. See example files at:
	# http://phylo.wikidot.com/biogeobears#files
	#
	# ---
	#
	# (Note that Nick's comments say to separate number of areas and names of areas
	# with a tab, but it is a space in his example data. Here, use tab.)
	line_1 <- glue::glue("{n_sp}\t{n_areas}\t({areas})") %>% as.character()
	rest <- glue::glue("{geo_for_bgb$species}\t{geo_for_bgb$combined}") %>% as.character()

	geo_for_bgb_formatted <- c(line_1, rest)

	readr::write_lines(geo_for_bgb_formatted, output_file)

	output_file

	}