aammd · May 18, 2020 15:33
diff --git a/example_data_manipulation.Rmd b/example_data_manipulation.Rmd
 ---
 title: "Exploring observations"
 author: "Andrew, based on Tim's lecture!"
 date: "18/05/2020"
 output: html_document
 ---

 ```{r setup, include=FALSE}
 # Global chunk option: echo the R source of every chunk in the rendered output.
 knitr::opts_chunk$set(echo = TRUE)
 ```


 ```{r how_many}
 # Andrew, 18 May 2020 -- first look at the observation data.

 library(tidyverse)

 obs <- readr::read_csv("observations.csv")

 glimpse(obs)

 # Are they all bats? List the distinct values of `order` to check.
 # (Author's note from the original session: yup.)
 obs %>%
   pull(order) %>%
   unique()

 # Distinct taxon labels.
 obs %>%
   pull(taxon) %>%
   unique()

 # How many records carry a "BOLD" taxon label? Flag them, keep only the
 # flagged rows, and convert the result to a base data.frame.
 obs %>%
   mutate(is_bold = str_detect(taxon, "BOLD")) %>%
   filter(is_bold) %>%
   as.data.frame()
 ```


 OK, so there are not many observations whose taxon label contains `BOLD`, which apparently refers to something genetic.

 ```{r}
 # Keep only the records identified to species rank.
 filter(obs, rank == "SPECIES")
 ```

 ### Question 1 -- what is the earliest and latest observation of each species?


 In the style of the tidyverse, the `where` keyword corresponds to `filter()`:
 ```{r}
 # Species-rank records that also have a date; comma-separated conditions
 # passed to filter() must all hold.
 filter(obs, rank == "SPECIES", !is.na(date))
 ```

 Note that we can filter by two conditions at once

 ```{r}
 # Earliest and latest dated observation for each species-rank taxon.
 taxon_first_last <- obs %>%
   filter(rank == "SPECIES", !is.na(date)) %>%
   select(taxon, date) %>%
   group_by(taxon) %>%
   summarise(
     earliest = min(date),
     latest = max(date)
   )
 ```

 let's simplify the process of working with the dates, using `lubridate::year`

 ```{r}
 # Reduce both endpoints to calendar years.
 taxon_first_last %>%
   mutate(
     earliest = lubridate::year(earliest),
     latest = lubridate::year(latest)
   )
 ```

 alternatively:

 ```{r}
 # Convert every Date-class column to character.
 # NOTE(review): the prose below reports this has no visible effect; presumably
 # `earliest`/`latest` are not Date class (e.g. date-times), in which case the
 # predicate matches no column -- TODO confirm the column types with glimpse().
 mutate_if(taxon_first_last, lubridate::is.Date, as.character)
 ```

 uh well that doesn't seem to work

 ### Question 2 -- what is the range of latitudes for which we see certain genera

 ```{r}
 # Largest (`north`) and smallest (`south`) latitude recorded for each genus.
 # Rows missing either genus or latitude are dropped first.
 genus_lat_extremes <- obs %>%
   select(genus, latitude) %>%
   filter(!is.na(genus), !is.na(latitude)) %>%
   group_by(genus) %>%
   summarise(
     north = max(latitude),
     south = min(latitude)
   )
 ```

 Now we want to find the range of latitude for each genus

 * The operation `transform` corresponds to `mutate()`.
 * If you want to keep the original variables, use `mutate()`; if you want to discard them, use `transmute()`.

 ```{r}
 # Latitudinal span of each genus, sorted from narrowest to widest.
 genus_lat_extremes %>%
   mutate(span = north - south) %>%
   arrange(span)
 ```

 Can we add sample sizes? 

 ```{r}
 # Latitudinal span per genus, with the sample size kept alongside it.
 # `n` is the number of records that have both genus and latitude present.
 # The original pipeline added `n` with add_count() but lost it in summarise(),
 # so the sample size never reached the output; it is now computed inside
 # summarise() and retained.
 # Genera with a single record are dropped: their span would always be 0.
 obs %>%
   select(genus, latitude) %>%
   filter(complete.cases(.)) %>%
   group_by(genus) %>%
   summarise(
     n = n(),
     north = max(latitude),
     south = min(latitude)
   ) %>%
   filter(n > 1) %>%
   mutate(span = north - south) %>%
   arrange(span)
 ```

 ## make a simple "timeline" plot

 When do species appear and disappear? 
 ```{r}
 # One horizontal segment per taxon, running from its first to its last
 # observation year; taxa are ordered on the y axis by that duration.
 taxon_first_last %>%
   mutate(
     earliest = lubridate::year(earliest),
     latest = lubridate::year(latest),
     duration = latest - earliest,
     taxon = forcats::fct_reorder(taxon, duration)
   ) %>%
   ggplot(aes(x = earliest, xend = latest, y = taxon, yend = taxon)) +
   geom_segment()
 ```

 ## joining

 ```{r}
 # Same table as taxon_first_last, with both endpoints stored as years.
 taxon_first_last_yr <- mutate(
   taxon_first_last,
   earliest = lubridate::year(earliest),
   latest = lubridate::year(latest)
 )

 # Preview the first rows as a formatted table.
 knitr::kable(head(taxon_first_last_yr))
 ```

 Can we add taxonomic information back in?

 ```{r}
 # One row per distinct taxon / order / family / genus combination.
 taxonomic_info <- obs %>%
   select(taxon, order, family, genus) %>%
   distinct()

 # Preview the first rows as a formatted table.
 knitr::kable(head(taxonomic_info))
 ```

 Now join them together

 ```{r}
 # Taxa whose earliest observation year is after 1990, with the taxonomy
 # columns attached by matching on `taxon`.
 taxon_first_last_yr %>%
   filter(earliest > 1990) %>%
   left_join(y = taxonomic_info, by = "taxon")
 ```

 ### observers per species

 ```{r}
 # Number of distinct observers vs. number of observations per species,
 # on log-log axes and coloured by family.
 # Records with a missing or "Unknown" observer are excluded before counting.
 obs %>%
   filter(
     rank == "SPECIES",
     !is.na(observer),
     observer != "Unknown"
   ) %>%
   group_by(taxon) %>%
   summarise(
     observers = n_distinct(observer),
     observations = n()
   ) %>%
   arrange(observations) %>%
   # Name the join key explicitly, as the earlier join chunk does, so the join
   # does not depend on (or print a message about) whichever columns share names.
   left_join(taxonomic_info, by = "taxon") %>%
   ggplot(aes(x = observers, y = observations, colour = family)) +
   geom_point() +
   scale_x_log10() +
   scale_y_log10()
 ```

 ## stacking and a density plot

 ```{r}
 # Stack `earliest` and `latest` into name/value pairs, then draw one density
 # curve of the year values per name.
 taxon_first_last_yr %>%
   pivot_longer(cols = -taxon) %>%
   ggplot(aes(x = value, colour = name)) +
   geom_density()
 ```
	---
	title: "Exploring observations"
	author: "Andrew, based on Tim's lecture!"
	date: "18/05/2020"
	output: html_document
	---

	```{r setup, include=FALSE}
	# Global chunk option: echo the R source of every chunk in the rendered output.
	knitr::opts_chunk$set(echo = TRUE)
	```


	```{r how_many}
	# Andrew, 18 May 2020 -- first look at the observation data.

	library(tidyverse)

	obs <- readr::read_csv("observations.csv")

	glimpse(obs)

	# Are they all bats? List the distinct values of `order` to check.
	# (Author's note from the original session: yup.)
	obs %>%
	  pull(order) %>%
	  unique()

	# Distinct taxon labels.
	obs %>%
	  pull(taxon) %>%
	  unique()

	# How many records carry a "BOLD" taxon label? Flag them, keep only the
	# flagged rows, and convert the result to a base data.frame.
	obs %>%
	  mutate(is_bold = str_detect(taxon, "BOLD")) %>%
	  filter(is_bold) %>%
	  as.data.frame()
	```


	OK, so there are not many observations whose taxon label contains `BOLD`, which apparently refers to something genetic.

	```{r}
	# Keep only the records identified to species rank.
	filter(obs, rank == "SPECIES")
	```

	### Question 1 -- what is the earliest and latest observation of each species?


	In the style of the tidyverse, the `where` keyword corresponds to `filter()`:
	```{r}
	# Species-rank records that also have a date; comma-separated conditions
	# passed to filter() must all hold.
	filter(obs, rank == "SPECIES", !is.na(date))
	```

	Note that we can filter by two conditions at once

	```{r}
	# Earliest and latest dated observation for each species-rank taxon.
	taxon_first_last <- obs %>%
	  filter(rank == "SPECIES", !is.na(date)) %>%
	  select(taxon, date) %>%
	  group_by(taxon) %>%
	  summarise(
	    earliest = min(date),
	    latest = max(date)
	  )
	```

	let's simplify the process of working with the dates, using `lubridate::year`

	```{r}
	# Reduce both endpoints to calendar years.
	taxon_first_last %>%
	  mutate(
	    earliest = lubridate::year(earliest),
	    latest = lubridate::year(latest)
	  )
	```

	alternatively:

	```{r}
	# Convert every Date-class column to character.
	# NOTE(review): the prose below reports this has no visible effect; presumably
	# `earliest`/`latest` are not Date class (e.g. date-times), in which case the
	# predicate matches no column -- TODO confirm the column types with glimpse().
	mutate_if(taxon_first_last, lubridate::is.Date, as.character)
	```

	uh well that doesn't seem to work

	### Question 2 -- what is the range of latitudes for which we see certain genera

	```{r}
	# Largest (`north`) and smallest (`south`) latitude recorded for each genus.
	# Rows missing either genus or latitude are dropped first.
	genus_lat_extremes <- obs %>%
	  select(genus, latitude) %>%
	  filter(!is.na(genus), !is.na(latitude)) %>%
	  group_by(genus) %>%
	  summarise(
	    north = max(latitude),
	    south = min(latitude)
	  )
	```

	Now we want to find the range of latitude for each genus

	* The operation `transform` corresponds to `mutate()`.
	* If you want to keep the original variables, use `mutate()`; if you want to discard them, use `transmute()`.

	```{r}
	# Latitudinal span of each genus, sorted from narrowest to widest.
	genus_lat_extremes %>%
	  mutate(span = north - south) %>%
	  arrange(span)
	```

	Can we add sample sizes?

	```{r}
	# Latitudinal span per genus, with the sample size kept alongside it.
	# `n` is the number of records that have both genus and latitude present.
	# The original pipeline added `n` with add_count() but lost it in summarise(),
	# so the sample size never reached the output; it is now computed inside
	# summarise() and retained.
	# Genera with a single record are dropped: their span would always be 0.
	obs %>%
	  select(genus, latitude) %>%
	  filter(complete.cases(.)) %>%
	  group_by(genus) %>%
	  summarise(
	    n = n(),
	    north = max(latitude),
	    south = min(latitude)
	  ) %>%
	  filter(n > 1) %>%
	  mutate(span = north - south) %>%
	  arrange(span)
	```

	## make a simple "timeline" plot

	When do species appear and disappear?
	```{r}
	# One horizontal segment per taxon, running from its first to its last
	# observation year; taxa are ordered on the y axis by that duration.
	taxon_first_last %>%
	  mutate(
	    earliest = lubridate::year(earliest),
	    latest = lubridate::year(latest),
	    duration = latest - earliest,
	    taxon = forcats::fct_reorder(taxon, duration)
	  ) %>%
	  ggplot(aes(x = earliest, xend = latest, y = taxon, yend = taxon)) +
	  geom_segment()
	```

	## joining

	```{r}
	# Same table as taxon_first_last, with both endpoints stored as years.
	taxon_first_last_yr <- mutate(
	  taxon_first_last,
	  earliest = lubridate::year(earliest),
	  latest = lubridate::year(latest)
	)

	# Preview the first rows as a formatted table.
	knitr::kable(head(taxon_first_last_yr))
	```

	Can we add taxonomic information back in?

	```{r}
	# One row per distinct taxon / order / family / genus combination.
	taxonomic_info <- obs %>%
	  select(taxon, order, family, genus) %>%
	  distinct()

	# Preview the first rows as a formatted table.
	knitr::kable(head(taxonomic_info))
	```

	Now join them together

	```{r}
	# Taxa whose earliest observation year is after 1990, with the taxonomy
	# columns attached by matching on `taxon`.
	taxon_first_last_yr %>%
	  filter(earliest > 1990) %>%
	  left_join(y = taxonomic_info, by = "taxon")
	```

	### observers per species

	```{r}
	# Number of distinct observers vs. number of observations per species,
	# on log-log axes and coloured by family.
	# Records with a missing or "Unknown" observer are excluded before counting.
	obs %>%
	  filter(
	    rank == "SPECIES",
	    !is.na(observer),
	    observer != "Unknown"
	  ) %>%
	  group_by(taxon) %>%
	  summarise(
	    observers = n_distinct(observer),
	    observations = n()
	  ) %>%
	  arrange(observations) %>%
	  # Name the join key explicitly, as the earlier join chunk does, so the join
	  # does not depend on (or print a message about) whichever columns share names.
	  left_join(taxonomic_info, by = "taxon") %>%
	  ggplot(aes(x = observers, y = observations, colour = family)) +
	  geom_point() +
	  scale_x_log10() +
	  scale_y_log10()
	```

	## stacking and a density plot

	```{r}
	# Stack `earliest` and `latest` into name/value pairs, then draw one density
	# curve of the year values per name.
	taxon_first_last_yr %>%
	  pivot_longer(cols = -taxon) %>%
	  ggplot(aes(x = value, colour = name)) +
	  geom_density()
	```