Last active
March 3, 2021 21:02
-
-
Save joelnitta/68b7212f02a15fee974e51bb9c938a58 to your computer and use it in GitHub Desktop.
Format a dataframe of species and regions for BioGeoBEARS
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Write out a geography file in PHYLIP format for BioGeoBears | |
#' | |
#' @param data Dataframe; species distributions in regions, with one row per | |
#' region per species. Must have columns 'species' and 'region' | |
#' @param output_file Path to write outputfile | |
#' | |
#' @author Joel Nitta | |
#' | |
#' @return The the path to `output_file`. | |
#' | |
write_bgb_geo_matrix <- function (data, output_file) { | |
require(magrittr) | |
# Extract region names from data | |
region_names <- data %>% dplyr::pull(region) %>% unique %>% sort | |
# Make sure that worked | |
assertthat::assert_that(length(region_names) > 1, msg = "Need at least two regions") | |
# Make sure column names are "species" and "region" | |
assertthat::assert_that( | |
all(colnames(data) %in% c("species", "region")), | |
msg = "Data must include columns 'species' and 'region'") | |
# Convert input dataframe of regions (one row per region per species) into a dataframe | |
# with one row per species and a column of the combined occurrence in regions as 0101, etc. | |
geo_for_bgb <- | |
data %>% | |
dplyr::select(species, region) %>% | |
# Check for NA in names and regions | |
assertr::assert(assertr::not_na, species, region) %>% | |
# Check for spaces in species names | |
assertr::verify(all(stringr::str_detect(species, " ") == FALSE)) %>% | |
# Check regions are within expected range of values | |
assertr::assert(assertr::in_set(region_names), region) %>% | |
assertr::verify(all(region_names %in% region)) %>% | |
# Use only unique combinations of species x regions | |
unique() %>% | |
dplyr::mutate(abundance = 1) %>% | |
tidyr::pivot_wider(names_from = region, values_from = abundance, values_fill = list(abundance = 0)) %>% | |
# Rearrange regions into alphabetical order | |
dplyr::select(species, dplyr::all_of(region_names)) %>% | |
# Smash region occurrences into a single string per species (e.g., 0101) | |
tidyr::unite(combined, dplyr::all_of(region_names), sep = "", remove = FALSE) %>% | |
dplyr::select(species, combined) %>% | |
# Make sure each instance of "combined" has exactly the number of characters (regions) expected | |
assertr::assert(function(x) nchar(x) == length(region_names), combined) | |
# Get some summary numbers for PHYLIP header: number of species, number of regions, names of regions | |
n_sp <- nrow(geo_for_bgb) | |
n_areas <- length(region_names) | |
areas <- paste(region_names, collapse = " ") | |
# Format matrix for output in BioGeoBears format. | |
# | |
# Nick's comments: | |
# --- | |
# Geography file | |
# Notes: | |
# 1. This is a PHYLIP-formatted file. This means that in the | |
# first line, | |
# - the 1st number equals the number of rows (species) | |
# - the 2nd number equals the number of columns (number of areas) | |
# - after a tab, put the areas in parentheses, with spaces: (A B C D) | |
# | |
# 1.5. Example first line: | |
# 10 4 (A B C D) | |
# | |
# 2. The second line, and subsequent lines: | |
# speciesA 0110 | |
# speciesB 0111 | |
# speciesC 0001 | |
# ... | |
# | |
# 2.5a. This means a TAB between the species name and the area 0/1s | |
# 2.5b. This also means NO SPACE AND NO TAB between the area 0/1s. | |
# | |
# 3. See example files at: | |
# http://phylo.wikidot.com/biogeobears#files | |
# | |
# --- | |
# | |
# (Note that Nick's comments say to separate number of areas and names of areas | |
# with a tab, but it is a space in his example data. Here, use tab.) | |
line_1 <- glue::glue("{n_sp}\t{n_areas}\t({areas})") %>% as.character() | |
rest <- glue::glue("{geo_for_bgb$species}\t{geo_for_bgb$combined}") %>% as.character() | |
geo_for_bgb_formatted <- c(line_1, rest) | |
readr::write_lines(geo_for_bgb_formatted, output_file) | |
output_file | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment