Created
February 13, 2017 21:52
-
-
Save sdtaylor/c03cc5f891a96058cf98c088bca32f45 to your computer and use it in GitHub Desktop.
R Workshop dplyr() lesson
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr)
library(tidyr)

# Shrub measurements, one row per shrub. The columns used below are
# site, experiment, length, width and height.
shrub_data <- read.csv("~/dplyrLesson/shrub_volume_experiment.csv")

########################
# Some basics
########################

# Ordering rows: sort by site, then by height within each site.
# No more shrub_data <- shrub_data[order(shrub_data$site), ]
arrange(shrub_data, site, height)

# Keep rows according to some value.
# Similar to subset() or shrub_data[shrub_data$site > 1, ]
filter(shrub_data, site > 1)
# Several conditions separated by commas must all be true for a row to be kept.
filter(shrub_data, site > 1, experiment == "C")

# Select individual columns.
# Just like shrub_data[, c("length", "width")]
select(shrub_data, length, width)

# Make new columns with calculations.
# The volume of all shrubs?
mutate(shrub_data, volume = length * width * height)

# transmute() does the same thing, but keeps only the new columns.
transmute(shrub_data, volume = length * width * height)

# You can calculate any number of values at once.
transmute(
  shrub_data,
  volume = length * width * height,
  cover = length * width
)

# Exercises:
# 1) Get only shrub data from experiment A.
# 2) Extract all shrubs with heights greater than 4.
########################
# Tie different operations together using pipes
########################

# A pipe (%>%) takes the output of its left-hand side (either a data frame or
# the result of a function) and makes it the first input of the function on
# its right-hand side.
shrub_data %>% filter(site > 1)
shrub_data %>% filter(site > 1) %>% mutate(volume = length * width * height)

# Make pipelines easier to read by putting only one function per line.
shrub_data %>%
  filter(site > 1) %>%
  mutate(volume = length * width * height)

# Exercises:
# 1) Calculate the volume of only shrubs with height greater than 4.
########################
# Now for some actually useful info: summarise data with statistics
########################

# Calculate the volume, then get the mean and standard deviation from it.
# summarise() collapses the data to a single row of statistics.
shrub_data %>%
  mutate(volume = length * width * height) %>%
  summarise(volumeMean = mean(volume), volumeSD = sd(volume))

# But what about calculating statistics for each of our sites?
# Pipe the data through group_by() and name the column to group by;
# summarise() then returns one row per group.
shrub_data %>%
  mutate(volume = length * width * height) %>%
  group_by(site) %>%
  summarise(
    volumeMean = mean(volume),
    volumeSD = sd(volume),
    n = n(),                          # number of shrubs in the group
    se = sd(volume) / sqrt(n())       # standard error of the mean
  )

# Exercise:
# Calculate the mean and standard deviation of height and volume for each
# experiment group.
########################
# More complex example using a large dataset
########################

# The portal dataset is from 25 years of monthly rodent trapping on 24
# different plots. Each row is a single rodent with information on date, plot
# number, species, sex, and other characteristics.
rodents <- read.csv("~/dplyrLesson/PortalMammals_main.csv")

# First, a few more tools to use with summarise().
# n() can be used inside summarise() to count the number of rows in a group.

# Example: how many rodents in total were caught in each plot in June 1999?
rodents %>%
  filter(yr == 1999, mo == 6) %>%  # Select only entries from June 1999
  group_by(plot) %>%               # Group by plot so counts are per plot
  summarise(n = n())

# Up until now we've had a small number of rows printed to the screen. Now that
# there are more than 10 (the rodent data has 24 plots, so this output has 24
# rows), dplyr is trying to be nice with the output and not fill up our
# console. Let's assign this output to a new data frame to look at it fully.
june1999Counts <- rodents %>%
  filter(yr == 1999, mo == 6) %>%
  group_by(plot) %>%
  summarise(n = n())

# Now, for the same period (June 1999), I want to know how many of each
# species were caught in each plot. Note that this overwrites the per-plot
# totals stored in june1999Counts above.
june1999Counts <- rodents %>%
  filter(yr == 1999, mo == 6) %>%
  group_by(plot, species) %>%
  summarise(n = n())

# n_distinct() counts the number of distinct values in a group.
# How many species were caught at the site in each month in 1999?
species1999 <- rodents %>%
  filter(yr == 1999) %>%
  group_by(mo) %>%
  summarise(numSpecies = n_distinct(species))

# Exercises:
# 1) What's the average number of rodents caught in each plot in each year?
# 2) What's the average weight of all mice in each year? (The wgt column is
#    the weight of each rodent in grams.)
# 3) Graph the average mouse weight over all the years of the study.
#############################
# tidyr, another useful tool
#############################

# This data is from vegetation transect point counts. Each column is a plot,
# and 4 transects (each from 1-250 dm) were done in each plot.
# Very messy!
veg <- read.csv("~/dplyrLesson/transect_data.csv")

# For easier analysis, we want 1 point per row, as in the following column
# setup:
#   plot, transect, point, species
#
# The tidyr package is made for exactly this.
# NOTE(review): gather() still works but is superseded by pivot_longer() in
# tidyr >= 1.0.0; consider switching once the installed version is confirmed.
cleanVeg <- veg %>%
  gather(Plot, SpeciesID, -Transect, -Point)
# Here, Plot is a new column that takes on the old column names as its values.
# SpeciesID is a new column that takes on the values of all the cells.
# -Transect and -Point tell gather() to leave those columns as they are.

# tidyr can also be used together with dplyr.
# Here is the most common SpeciesID on each plot.
mostCommon <- veg %>%
  gather(Plot, SpeciesID, -Transect, -Point) %>%
  group_by(Plot, SpeciesID) %>%
  summarise(n = n()) %>%   # one row per Plot/SpeciesID; still grouped by Plot
  top_n(1, n)
# top_n(1, n) ranks the rows within each group (here, each Plot) by n and keeps
# the top 1, i.e. the species with the highest count on that plot. If several
# species tie for the highest count, all of them are kept.
# NOTE(review): top_n() is superseded by slice_max() in dplyr >= 1.0.0.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
site | experiment | length | width | height | |
---|---|---|---|---|---|
1 | A | 2.2 | 1.3 | 9.6 | |
1 | B | 2.1 | 2.2 | 7.6 | |
1 | C | 2.7 | 1.5 | 2.2 | |
2 | A | 3.0 | 4.5 | 1.5 | |
2 | B | 3.1 | 3.1 | 4.0 | |
2 | C | 2.5 | 2.8 | 3.0 | |
3 | A | 1.9 | 1.8 | 4.5 | |
3 | B | 1.1 | 0.5 | 2.3 | |
3 | C | 3.5 | 2.0 | 7.5 | |
4 | A | 2.9 | 2.7 | 3.2 | |
4 | B | 4.5 | 4.8 | 6.5 | |
4 | C | 1.2 | 1.8 | 2.7 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment