sdtaylor · February 13, 2017 21:52
diff --git a/dplyr_lession.R b/dplyr_lession.R
 library(dplyr)
 library(tidyr)

 # Load the shrub measurements used throughout the first part of the lesson.
 shrub_data <- read.csv("~/dplyrLesson/shrub_volume_experiment.csv")

 ########################
 # Some basics
 ########################
 # Sort rows with arrange()
 # (replaces shrub_data = shrub_data[order(shrub_data$site), ])
 arrange(shrub_data, site, height)

 # Keep only the rows that satisfy a condition with filter()
 # (like subset() or shrub_data[shrub_data$site > 1, ])
 filter(shrub_data, site > 1)

 # Conditions separated by commas must all hold for a row to be kept
 filter(shrub_data, site > 1, experiment == "C")

 # Pick out individual columns with select()
 # (like shrub_data[, c("length", "width")])
 select(shrub_data, length, width)

 # Add calculated columns with mutate(): here, the volume of every shrub
 mutate(shrub_data, volume = length * width * height)

 # transmute() computes the same thing but returns only the new columns
 transmute(shrub_data, volume = length * width * height)

 # Any number of new columns can be computed in a single call
 transmute(shrub_data, volume = length * width * height, cover = length * width)

 #Exercises: 
 #1) Get only shrub data from experiment A.

 #2) Extract all shrubs with heights greater than 4. 

 #######################
 # Tie different operations together using pipes
 ######################

 #Pipes take the output of one thing (either a function or a data frame) and make it the input to another

 # The simplest pipe: shrub_data becomes the first argument of filter()
 shrub_data %>% filter(site > 1)

 # Pipes can be chained: filter the rows first, then add the volume column
 shrub_data %>% filter(site > 1) %>% mutate(volume = length * width * height)

 # The same chain is easier to read with only one function per line
 shrub_data %>%
   filter(site > 1) %>%
   mutate(volume = length * width * height)


 #Exercises:
 #1) Calculate the volume of only shrubs with height greater than 4. 

 ######################
 #Now for some actually useful info: summarize data with statistics
 #####################
 # Compute each shrub's volume, then collapse the whole table down to the
 # mean and standard deviation of that volume.
 shrub_data %>%
   mutate(volume = length * width * height) %>%
   summarise(
     volumeMean = mean(volume),
     volumeSD = sd(volume)
   )


 # To get statistics for each site separately, pipe through group_by() first
 # and name the column to group by. n() is the number of rows in each group,
 # used here both as a count and to turn the SD into a standard error.
 shrub_data %>%
   mutate(volume = length * width * height) %>%
   group_by(site) %>%
   summarise(
     volumeMean = mean(volume),
     volumeSD = sd(volume),
     n = n(),
     se = sd(volume) / sqrt(n())
   )


 #Exercise
 # Calculate the mean and standard deviation of height and volume for each experiment group. 






 ######################
 #More complex example using a large dataset
 #####################
 #The portal dataset is from 25 years of monthly rodent trapping on 24 different plots.
 #Each row is a single rodent with information on date, plot number, species, sex, and other characteristics
 rodents <- read.csv("~/dplyrLesson/PortalMammals_main.csv")

 # First, a few more tools to use inside summarise()

 # n() counts the number of rows inside the current group.
 # Example: how many rodents in total were caught in each plot in June 1999?
 rodents %>%
   filter(yr == 1999, mo == 6) %>%  # keep only entries from June 1999
   group_by(plot) %>%               # make one group per plot
   summarise(n = n())

 # Up until now only a small number of rows has been printed to the screen.
 # Now that there are more than 10 (the rodent data has 24 plots, so this
 # output has 24 rows), dplyr tries to be nice and not fill up the console.
 # Assign the output to a new data frame to look at it in full.

 june1999Counts <- rodents %>%
   filter(yr == 1999, mo == 6) %>%
   group_by(plot) %>%
   summarise(n = n())

 # For the same period, June 1999, count how many of each species were caught
 # in each plot. Note that this overwrites the per-plot counts stored above.
 june1999Counts <- rodents %>%
   filter(yr == 1999, mo == 6) %>%
   group_by(plot, species) %>%
   summarise(n = n())


 # n_distinct() counts the number of distinct values within a group.
 # How many species were caught at the site in each month of 1999?
 species1999 <- rodents %>%
   filter(yr == 1999) %>%
   group_by(mo) %>%
   summarise(numSpecies = n_distinct(species))


 #Exercises
 #1) What's the average number of rodents caught in each plot in each year?

 #2) What's the average weight of all mice in each year? (the wgt column is the weight of each rodent in grams)

 #3) Graph the average mouse weight over all the years of the study


 #############################
 # tidyr, another useful tool
 #############################
 #This data is from vegetation transect point counts. Each column is a plot, and 4 transects (each from 1-250dm) were done in each plot.
 #Very messy!
 veg <- read.csv("~/dplyrLesson/transect_data.csv")

 # For easier analysis we want one point per row, with this column setup:
 #   plot, transect, point, species
 #
 # The tidyr package is made for exactly this.
 # (Newer tidyr versions offer pivot_longer() as the successor to gather().)

 cleanVeg <- veg %>%
   gather(Plot, SpeciesID, -Transect, -Point)

 # Plot is a new column that takes its values from the old column names.
 # SpeciesID is a new column that takes its values from all the cells.
 # -Transect and -Point tell gather() to leave those two columns as they are.

 # tidyr can also be combined with dplyr.
 # Here is the most common SpeciesID on each plot: reshape, count the rows for
 # every Plot/SpeciesID pair, then keep the pair with the largest count.
 mostCommon <- veg %>%
   gather(Plot, SpeciesID, -Transect, -Point) %>%
   group_by(Plot, SpeciesID) %>%
   summarise(n = n()) %>%
   top_n(1, n)

 #This uses the top_n() function, which ranks the rows within each group and keeps the top n of them;
 #in this case, just the row(s) with the single highest count.
diff --git a/shrub_volume.csv b/shrub_volume.csv
	library(dplyr)
	library(tidyr)

	# Load the shrub measurements used throughout the first part of the lesson.
	shrub_data <- read.csv("~/dplyrLesson/shrub_volume_experiment.csv")

	########################
	# Some basics
	########################
	# Sort rows with arrange()
	# (replaces shrub_data = shrub_data[order(shrub_data$site), ])
	arrange(shrub_data, site, height)

	# Keep only the rows that satisfy a condition with filter()
	# (like subset() or shrub_data[shrub_data$site > 1, ])
	filter(shrub_data, site > 1)

	# Conditions separated by commas must all hold for a row to be kept
	filter(shrub_data, site > 1, experiment == "C")

	# Pick out individual columns with select()
	# (like shrub_data[, c("length", "width")])
	select(shrub_data, length, width)

	# Add calculated columns with mutate(): here, the volume of every shrub,
	# i.e. the product of its length, width and height columns
	mutate(shrub_data, volume = length * width * height)

	# transmute() computes the same thing but returns only the new columns
	transmute(shrub_data, volume = length * width * height)

	# Any number of new columns can be computed in a single call
	transmute(shrub_data, volume = length * width * height, cover = length * width)

	#Exercises:
	#1) Get only shrub data from experiment A.

	#2) Extract all shrubs with heights greater than 4.

	#######################
	# Tie different operations together using pipes
	######################

	#Pipes take the output of one thing (either a function or a data frame) and make it the input to another

	# The simplest pipe: shrub_data becomes the first argument of filter()
	shrub_data %>% filter(site > 1)

	# Pipes can be chained: filter the rows first, then add the volume column
	shrub_data %>% filter(site > 1) %>% mutate(volume = length * width * height)

	# The same chain is easier to read with only one function per line
	shrub_data %>%
	  filter(site > 1) %>%
	  mutate(volume = length * width * height)


	#Exercises:
	#1) Calculate the volume of only shrubs with height greater than 4.

	######################
	#Now for some actually useful info: summarize data with statistics
	#####################
	# Compute each shrub's volume (length * width * height), then collapse the
	# whole table down to the mean and standard deviation of that volume.
	shrub_data %>%
	  mutate(volume = length * width * height) %>%
	  summarise(
	    volumeMean = mean(volume),
	    volumeSD = sd(volume)
	  )


	# To get statistics for each site separately, pipe through group_by() first
	# and name the column to group by. n() is the number of rows in each group,
	# used here both as a count and to turn the SD into a standard error.
	shrub_data %>%
	  mutate(volume = length * width * height) %>%
	  group_by(site) %>%
	  summarise(
	    volumeMean = mean(volume),
	    volumeSD = sd(volume),
	    n = n(),
	    se = sd(volume) / sqrt(n())
	  )


	#Exercise
	# Calculate the mean and standard deviation of height and volume for each experiment group.






	######################
	#More complex example using a large dataset
	#####################
	#The portal dataset is from 25 years of monthly rodent trapping on 24 different plots.
	#Each row is a single rodent with information on date, plot number, species, sex, and other characteristics
	rodents <- read.csv("~/dplyrLesson/PortalMammals_main.csv")

	# First, a few more tools to use inside summarise()

	# n() counts the number of rows inside the current group.
	# Example: how many rodents in total were caught in each plot in June 1999?
	rodents %>%
	  filter(yr == 1999, mo == 6) %>%  # keep only entries from June 1999
	  group_by(plot) %>%               # make one group per plot
	  summarise(n = n())

	# Up until now only a small number of rows has been printed to the screen.
	# Now that there are more than 10 (the rodent data has 24 plots, so this
	# output has 24 rows), dplyr tries to be nice and not fill up the console.
	# Assign the output to a new data frame to look at it in full.

	june1999Counts <- rodents %>%
	  filter(yr == 1999, mo == 6) %>%
	  group_by(plot) %>%
	  summarise(n = n())

	# For the same period, June 1999, count how many of each species were caught
	# in each plot. Note that this overwrites the per-plot counts stored above.
	june1999Counts <- rodents %>%
	  filter(yr == 1999, mo == 6) %>%
	  group_by(plot, species) %>%
	  summarise(n = n())


	# n_distinct() counts the number of distinct values within a group.
	# How many species were caught at the site in each month of 1999?
	species1999 <- rodents %>%
	  filter(yr == 1999) %>%
	  group_by(mo) %>%
	  summarise(numSpecies = n_distinct(species))


	#Exercises
	#1) What's the average number of rodents caught in each plot in each year?

	#2) What's the average weight of all mice in each year? (the wgt column is the weight of each rodent in grams)

	#3) Graph the average mouse weight over all the years of the study


	#############################
	# tidyr, another useful tool
	#############################
	#This data is from vegetation transect point counts. Each column is a plot, and 4 transects (each from 1-250dm) were done in each plot.
	#Very messy!
	veg <- read.csv("~/dplyrLesson/transect_data.csv")

	# For easier analysis we want one point per row, with this column setup:
	#   plot, transect, point, species
	#
	# The tidyr package is made for exactly this.
	# (Newer tidyr versions offer pivot_longer() as the successor to gather().)

	cleanVeg <- veg %>%
	  gather(Plot, SpeciesID, -Transect, -Point)

	# Plot is a new column that takes its values from the old column names.
	# SpeciesID is a new column that takes its values from all the cells.
	# -Transect and -Point tell gather() to leave those two columns as they are.

	# tidyr can also be combined with dplyr.
	# Here is the most common SpeciesID on each plot: reshape, count the rows for
	# every Plot/SpeciesID pair, then keep the pair with the largest count.
	mostCommon <- veg %>%
	  gather(Plot, SpeciesID, -Transect, -Point) %>%
	  group_by(Plot, SpeciesID) %>%
	  summarise(n = n()) %>%
	  top_n(1, n)

	#This uses the top_n() function, which ranks the rows within each group and keeps the top n of them;
	#in this case, just the row(s) with the single highest count.
site	experiment	length	width	height
1	A	2.2	1.3	9.6
1	B	2.1	2.2	7.6
1	C	2.7	1.5	2.2
2	A	3.0	4.5	1.5
2	B	3.1	3.1	4.0
2	C	2.5	2.8	3.0
3	A	1.9	1.8	4.5
3	B	1.1	0.5	2.3
3	C	3.5	2.0	7.5
4	A	2.9	2.7	3.2
4	B	4.5	4.8	6.5
4	C	1.2	1.8	2.7