Skip to content

Instantly share code, notes, and snippets.

@joelnitta
Last active March 3, 2021 21:02
Show Gist options
  • Save joelnitta/68b7212f02a15fee974e51bb9c938a58 to your computer and use it in GitHub Desktop.
Save joelnitta/68b7212f02a15fee974e51bb9c938a58 to your computer and use it in GitHub Desktop.
Format a dataframe of species and regions for BioGeoBEARS
#' Write out a geography file in PHYLIP format for BioGeoBears
#'
#' @param data Dataframe; species distributions in regions, with one row per
#' region per species. Must have columns 'species' and 'region'
#' @param output_file Path to write outputfile
#'
#' @author Joel Nitta
#'
#' @return The the path to `output_file`.
#'
write_bgb_geo_matrix <- function (data, output_file) {
require(magrittr)
# Extract region names from data
region_names <- data %>% dplyr::pull(region) %>% unique %>% sort
# Make sure that worked
assertthat::assert_that(length(region_names) > 1, msg = "Need at least two regions")
# Make sure column names are "species" and "region"
assertthat::assert_that(
all(colnames(data) %in% c("species", "region")),
msg = "Data must include columns 'species' and 'region'")
# Convert input dataframe of regions (one row per region per species) into a dataframe
# with one row per species and a column of the combined occurrence in regions as 0101, etc.
geo_for_bgb <-
data %>%
dplyr::select(species, region) %>%
# Check for NA in names and regions
assertr::assert(assertr::not_na, species, region) %>%
# Check for spaces in species names
assertr::verify(all(stringr::str_detect(species, " ") == FALSE)) %>%
# Check regions are within expected range of values
assertr::assert(assertr::in_set(region_names), region) %>%
assertr::verify(all(region_names %in% region)) %>%
# Use only unique combinations of species x regions
unique() %>%
dplyr::mutate(abundance = 1) %>%
tidyr::pivot_wider(names_from = region, values_from = abundance, values_fill = list(abundance = 0)) %>%
# Rearrange regions into alphabetical order
dplyr::select(species, dplyr::all_of(region_names)) %>%
# Smash region occurrences into a single string per species (e.g., 0101)
tidyr::unite(combined, dplyr::all_of(region_names), sep = "", remove = FALSE) %>%
dplyr::select(species, combined) %>%
# Make sure each instance of "combined" has exactly the number of characters (regions) expected
assertr::assert(function(x) nchar(x) == length(region_names), combined)
# Get some summary numbers for PHYLIP header: number of species, number of regions, names of regions
n_sp <- nrow(geo_for_bgb)
n_areas <- length(region_names)
areas <- paste(region_names, collapse = " ")
# Format matrix for output in BioGeoBears format.
#
# Nick's comments:
# ---
# Geography file
# Notes:
# 1. This is a PHYLIP-formatted file. This means that in the
# first line,
# - the 1st number equals the number of rows (species)
# - the 2nd number equals the number of columns (number of areas)
# - after a tab, put the areas in parentheses, with spaces: (A B C D)
#
# 1.5. Example first line:
# 10 4 (A B C D)
#
# 2. The second line, and subsequent lines:
# speciesA 0110
# speciesB 0111
# speciesC 0001
# ...
#
# 2.5a. This means a TAB between the species name and the area 0/1s
# 2.5b. This also means NO SPACE AND NO TAB between the area 0/1s.
#
# 3. See example files at:
# http://phylo.wikidot.com/biogeobears#files
#
# ---
#
# (Note that Nick's comments say to separate number of areas and names of areas
# with a tab, but it is a space in his example data. Here, use tab.)
line_1 <- glue::glue("{n_sp}\t{n_areas}\t({areas})") %>% as.character()
rest <- glue::glue("{geo_for_bgb$species}\t{geo_for_bgb$combined}") %>% as.character()
geo_for_bgb_formatted <- c(line_1, rest)
readr::write_lines(geo_for_bgb_formatted, output_file)
output_file
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment