jstaf · March 8, 2015 02:09
diff --git a/biomaRt_tutorial.R b/biomaRt_tutorial.R
 # biomaRt is a data mining tool that allows you extract massive amounts of data
 # from online databases with minimal effort.

 # This tutorial will use biomaRt to find the average distance between 
 # transcription factor binding sites and the genes they are known to act on in 
 # Drosophila (let's say we want to know how large an area we need to search 
 # up/down from a gene while looking for new binding sites). Interestingly, I
 # don't think there's actually a published value for this, so we're doing
 # something new here.

 # Jeff Stafford

 # Load our starting dataset.
 known_enhancers <- read.delim(file= "oreganno_dmel_full.txt", as.is = TRUE)

 # This file was generated from oreganno_FULL_08Nov10.txt.gz, retrieved from 
 # http://www.oreganno.org/oregano/Dump.jsp. Oreganno is a free and open
 # regulatory element database. This is literally all of the Drosophila-related
 # information in the database.
 str(known_enhancers)

 # Bash commands used to extract the Drosophila genes and generate the above file:
 # gunzip oreganno_FULL_08Nov10.txt.gz
 # grep -i 'drosophila melanogaster' > oreganno_dmel_data.txt 
 # head -n 1 oreganno_FULL_08Nov10.txt > oreganno_dmel_full.txt
 # cat oreganno_dmel_data.txt >> oreganno_dmel_full.txt 

 # We are going to try to calculate the average distance between transcription
 # factors and their targets. To do this, we need the start and end positions of
 # each target.

 # Our database already has the enhancer locations and targets.
 head(cbind(known_enhancers$Gene.name, known_enhancers$chromStart))

 # Now, let's try and retrieve all the information that would normally be in a
 # .bed file for each gene (the gene start and end). 

 library(biomaRt)

 # We can browse a list of "marts" to use, generally each one is maintained by a
 # separate organization. We're going to use ensembl, as it generally has the
 # most variety in terms of species.
 martList <- listMarts() 
 head(martList)
 ensembl <- useMart("ensembl")

 # Which datasets do we want to use?
 datasets <- listDatasets(ensembl)
 head(datasets)
 ensembl <- useMart("ensembl", dataset = "dmelanogaster_gene_ensembl")
 # As an important side note, the Oreganno data and ensembl mart are both using 
 # the BDGP5 annotation (the current annotation is BDGP6). A lot of things
 # changed between these two versions, so using mismatched annotations would be BAD.

 # What data do we want to retrieve?
 attributes <- listAttributes(ensembl)
 head(attributes)

 # In this case, we want to grab several different versions of each gene name,
 # chromosome each gene is on, and the start and end positions for each.
 bedmap <- getBM(attributes = c("ensembl_gene_id", # FBgn number - changes often, but is highly specific
                               "flybasename_gene", # Actual gene names
                               "flybasecgid_gene", # CG number - similar to FBgn, but not all datasets have this.
                               "chromosome_name",
                               "start_position",
                               "end_position"),
                filters = "flybasename_gene", # "filters" is the category of values we are searching by
                values = known_enhancers$Gene.name, # "values" are the actual values we are searching with
                mart = ensembl) # Need to specify the mart/dataset we are searching.

 # If something goes wrong, or if Ensembl ever updates its mart to BDGP6, we can
 # also load the data this way:
 # bedmap <- read.csv("TFbedmap.csv", as.is = TRUE)

 # Check to make sure we got an annotation for every gene known to be associated
 # with a transcription factor.
 length(unique(known_enhancers$Gene.ID)) == length(unique(bedmap$flybasename_gene))

 # Upon closer inspection of our TF dataset, much of the mismatch is due to the
 # fact that not all of our binding sites are actually binding sites. The dataset
 # includes regulatory regions not associated with any gene.
 unique(known_enhancers$Type)
 sub <- subset(known_enhancers, known_enhancers$Type == "TRANSCRIPTION FACTOR BINDING SITE")
 sum(is.na(match(bedmap$flybasecgid_gene, sub$Gene.ID)))/length(sub$Gene.ID)
 # All in all, it looks like we are missing about gene start/end information for 
 # 11 percent of our genes. This is likely due to an annotation mismatch that I
 # don't particularly feel like troubleshooting at the moment.

 # Match up and annotate our enhancer db with what we know about their target genes.
 tf_idx <- match(known_enhancers$Gene.ID, bedmap$flybasecgid_gene)

 # With this index, we can simply add in our new data from ensembl.
 known_enhancers$gene_start <- bedmap$start_position[tf_idx]
 known_enhancers$gene_end <- bedmap$end_position[tf_idx]

 # Okay so now we know where all of the genes in our TF database start. Lets
 # calculate distances.
 removeNegative <- function(bp) {
  if (is.na(bp)) {
    NA
  }
  else if (bp<0) {
    NA
  } else {
    bp
  }
 }

 known_enhancers$dist_from_start <- known_enhancers$chromStart - known_enhancers$gene_start
 known_enhancers$dist_from_start <- apply(as.matrix(known_enhancers$dist_from_start),1,removeNegative)
 known_enhancers$dist_from_end <- known_enhancers$chromStart - known_enhancers$gene_end
 known_enhancers$dist_from_end <- apply(as.matrix(known_enhancers$dist_from_end),1,removeNegative)

 mean(known_enhancers$dist_from_start, na.rm = TRUE)
 mean(known_enhancers$dist_from_end, na.rm = TRUE)

 range(known_enhancers$dist_from_start, na.rm = TRUE)
 range(known_enhancers$dist_from_end, na.rm = TRUE)

 # That's huge, but it looks like there two TFs with absurdly distant enhancers... 
 # I'm going to arbitrarily exclude outliers over 50kb as a result (the stuff
 # we'd probably want to do for downstream analysis won't even go that far
 # anyways).

 start_subset <- subset(known_enhancers, known_enhancers$dist_from_start < 50000)
 hist(start_subset$dist_from_start, breaks = seq(0,50000,1000))
 end_subset <- subset(known_enhancers, known_enhancers$dist_from_end < 50000)
 hist(end_subset$dist_from_end, breaks = seq(0,50000,1000))

 start_mean <- mean(start_subset$dist_from_start, na.rm = TRUE)
 start_sd <- sd(start_subset$dist_from_start, na.rm = TRUE)

 end_mean <- mean(end_subset$dist_from_end, na.rm = TRUE)
 end_sd <- sd(end_subset$dist_from_end, na.rm = TRUE)

 start_mean + start_sd
 end_mean + end_sd
 # It looks like enhancers are generally within 26.5kb of genes in Drosophila. 

 length(start_subset$dist_from_start)/length(end_subset$dist_from_end)
 # There are twice as many known enhancers upstream of genes compared to downstream.

 # Again, these are very rough estimates and should be taken with a MASSIVE grain
 # of salt (this is absurdly biased towards genes/enhancers that have been
 # studied more). But hey, we answered our initial question and learned to use
 # biomaRt, right?

diff --git a/ggplot2_tutorial.R b/ggplot2_tutorial.R
 # ggplot2 is a plotting framework that is relatively easy to use, powerful, AND
 # it looks good.

 # Jeff Stafford

 library(ggplot2)

 # Here's the data we're going to use (bundled with ggplot2).
 str(msleep)
 # Looks like it's data on sleep of some kind.
 data <- msleep

 # What is a ggplot2 object? Basically it is your data + information on how to
 # interpret it + the actual geometry it uses to plot it.

 # How to create ggplot2 objects:

 # You can add as much data in the inital function call as you want. All of these
 # work, but the final version is the only "complete" object that fully specifies
 # the data used for the plot. 
 ref <- ggplot()
 ref <- ggplot(data)
 ref <- ggplot(data, aes(x = bodywt, y = sleep_total))

 # To store an object (to add to it later/plot it on demand), just give it a
 # reference. Simply typing the reference will display the plot (if you've
 # provided enough information to make it.)
 ref
 # As you can see, we haven't specified everything we need yet.

 # IMPORTANT: There are 3 components to making a plot with a ggplot object: your
 # data, the aesthetic mappings of your data, and the geometry. If you are
 # missing one, you won't get a functional plot.

 # Your data should be a dataframe with everything you want to plot. Note that it
 # is possible to put data from multiple sources (ie. different dataframes) in 
 # the same plot, but it's easier if everything is in the same 2-dimensional
 # dataframe.
 ref <- ggplot(data)

 # The aesthetic mappings tell ggplot2 how to interpret your data. Which values
 # in your dataframe are the y-values, x-values, what should be used for colors, etc.
 ref <- ggplot(data, aes(x = bodywt, y = sleep_total))

 # The geometry is the actual stuff that goes on the plot. You can specify any 
 # geometry as long as you have supplied the values it needs. If you've specified
 # the required aesthetic mappings (which data corresponds to x, y, etc.), all
 # you need to do is tell ggplot2 to create a certain geometry- for instance a
 # scatterplot.

 # Just add the geometry you want to your object. In this case, we are making a scatterplot.
 ref <- ggplot(data, aes(x = bodywt, y = sleep_total)) + geom_point()
 ref

 # All you need to do to add more information to your plot/change things is add
 # on more elements. Lets add a logarithmic scale on the x axis.
 ref <- ref + scale_x_log10()
 ref

 # Lets add a smoothed mean.
 ref + geom_smooth()

 # You can also specify aesthetics inside the call to create geomtery. 
 ref <- ggplot(data) + geom_point(aes(x = bodywt, y = sleep_total)) + scale_x_log10()
 ref
 ref <- ref + geom_smooth()
 ref
 # Why didn't that work? This is because when we specfy aesthetics inside a call 
 # to geomtery it only applies for that layer (only geom_point got the x and y
 # values). The only information that gets passed to all geometery calls is
 # aethetics specified in the initial creation of the ggplot object.

 # So if we wanted that to work, we'd have to do this:
 ggplot(data) + scale_x_log10() + 
  geom_point(aes(x = bodywt, y = sleep_total)) +
  geom_smooth(aes(x = bodywt, y = sleep_total))

 # It's important to note that geometry will automatically use any aesthetic
 # mappings that it understands, and ignore ones it doesn't. So if you specify as
 # much stuff as you can in the inital call that can be used, it'll save you
 # work.

 # Like this:
 ggplot(data, aes(x = bodywt, y = sleep_total)) + scale_x_log10() + geom_point() + geom_smooth()

 # Let's follow up with a few very common plot/geometry types and mappings you
 # might be interested in:

 # These x and y mappings (and the log scale on the x axis will be used for all later plots).
 plot <- ggplot(data, aes(x = bodywt, y = sleep_total)) + scale_x_log10()

 # First lets add color based on what things eat. Note that it automatically adds a legend.
 plot + geom_point(aes(color = vore))
 # We used a factor there, but we can also use a continuous variable for color as well.
 plot + geom_point(aes(color = brainwt))
 # We can change the legend to change the colors in this case.
 plot + geom_point(aes(color = brainwt)) + scale_color_gradient2()
 # Change the colors
 plot + geom_point(aes(color = log(brainwt))) + 
  scale_color_gradient2(low = "green", mid = "yellow", high = "red", 
                        midpoint = -4, na.value = "purple")

 # How about changing size?
 plot + geom_point(aes(size = sleep_rem))
 # Or alpha (add some titles and labels while we're at it)?
 plot + geom_point(aes(alpha = sleep_rem)) + 
  xlab("this is our x axis") + ylab("this is our y axis") + ggtitle("title") + scale_alpha("our legend")

 # If we want to simply change a plot value like marker shape or size without
 # mapping it to data, just specify it outside the call to aesthetics.
 plot + geom_point(aes(shape = vore), size = 6, color = "orange")

 # Let's facet our data by a factor:
 plot + geom_point() + facet_wrap(~vore)

 # Let's put it all together...

 library(scales) 
 # oob specifies what to do with out of bounds values for any scale (normally the
 # value gets changed to NA), "squish" sets them to scale max or min, to use
 # squish you need the "scales" package. 

 ggplot(data, aes(x = bodywt, y = sleep_total, size = log(brainwt), color = sleep_rem)) +
  scale_x_log10("Body weight") + scale_y_continuous("Total sleep (hours)") +
  geom_point() +
  facet_wrap(~ vore, nrow = 1 , ncol = 5) + 
  scale_color_gradient(low = "firebrick1", na.value = "green", limits = c(0,4), oob = squish)

 # Note that we were manipulating aesthetic mappings that geom_point() 
 # understands. To see what it understands, check out either the help for 
 # ?geom_point or its documentation (with examples) at 
 # http://docs.ggplot2.org/current/

 # Now for a few other types of plots:

 # Boxplot... note that stats are automatically performed, more about that later...
 ggplot(data, aes(x = vore, y = sleep_total)) + geom_boxplot()
 ggplot(data, aes(x = vore, y = sleep_total, fill = vore)) + geom_boxplot()

 # 1D density
 ggplot(data, aes(x = sleep_total, fill = vore)) + geom_density(alpha = 0.5)
 # 2D density
 ggplot(data, aes(x = sleep_total, y = sleep_rem)) + geom_density2d()
 # Violin plot
 ggplot(data, aes(x = vore, y = sleep_total)) + geom_violin()
 # Jittered scatterplot
 ggplot(data, aes(x = vore, y = sleep_total)) + geom_jitter(position = position_jitter(width = 0.2))
 # Another method for jittering a scatterplot + violin plot
 ggplot(data, aes(x = vore, y = sleep_total)) + geom_violin() + geom_point(position = "jitter")
 # Bar plot 
 ggplot(data, aes(x = vore)) + geom_bar()
 # Note that it automatically is binning the number of values in "vore".

 # Bars are automatically ordered alphabetically (apparently people say that this
 # is not a bug, it's a "feature"...). To reorder a factor:
 reordered <- factor(data$vore, levels = c("herbi","omni","carni", "insecti", NA))
 # Anything that reorders a factor will work to change bar order, order of color labels, etc.
 ggplot() + geom_bar(aes(x = reordered))

 # Let's graph mean sleep/category instead of just the raw number of animals in each category.
 sub <- subset(data, is.na(data$vore) == FALSE)
 categories <- unique(sub$vore)
 sleepMeans <- rep(NA, length(categories))
 names(sleepMeans) <- categories
 sleepSEM <- sleepMeans
 for (cat in categories) {
  sleepMeans[cat] <- mean(sub$sleep_total[sub$vore == cat])
  sleepSEM[cat] <- sd(sub$sleep_total)/sqrt(length(sub$sleep_total[sub$vore == cat]))
 }
 ggplot() + geom_bar(aes(x = sleepMeans, fill = names(sleepMeans)))
 # What happened? geom_bar() and (ggplot2 in general) automatically bins values,
 # which can be really annoying. So it's counting one value for each level of the factor.

 # Use "stat_identity" when calling geom_bar instead (geom_bar() implicitly calls
 # "stat_bin") and map a value to y.
 ggplot() + geom_bar(aes(x = names(sleepMeans), y = sleepMeans, fill = names(sleepMeans)), stat = "identity")

 # Converting to a dataframe for ease-of-use later.
 sleep <- as.data.frame(sleepMeans)
 colnames(sleep) <- c("means")

 # Let's add error bars, we calculated standard error of the mean earlier...
 plot <- ggplot(sleep, aes(x = rownames(sleep), y = means, fill = rownames(sleep),
                          ymin = means - sleepSEM, ymax = means + sleepSEM)) + 
  geom_bar(stat = "identity")
 plot + geom_errorbar()
 # Change errorbar width:
 plot + geom_errorbar(width = 0.5)


 # Let's do an in-depth example (all of this can be applied to other plot types):

 # Reorder bars in descending order of their value
 idx <- order(sleep$means, decreasing = TRUE)
 sleep$name <- factor(rownames(sleep), levels = rownames(sleep)[idx])

 # Create a custom color palette with RColorBrewer
 library(RColorBrewer)
 display.brewer.all()
 palette <- brewer.pal(n = length(rownames(sleep))*2, "Spectral")[seq.int(1,8,2)]
 names(palette) <- levels(sleep$name)

 # Notice that it's just using hexadecimal color codes. You can use a vector of
 # any R colors/hex codes you can think of.
 palette

 example <- ggplot(sleep, aes(x = name, y = means, fill = name,
                  ymin = means - sleepSEM, ymax = means + sleepSEM)) + 
  geom_bar(stat = "identity") + geom_errorbar(width = 0.5) +
  scale_y_continuous(limits = c(0, max(sleep$means)*1.5)) +
  xlab("Food type") + ylab("Average sleep per night (hours)") + 
  scale_fill_manual(values = palette) +
  guides(fill = FALSE) # this kills the redundant legend
 example

 # Change theme elements to white.
 example + theme(panel.background = element_rect(fill = "white"),
                panel.grid.major = element_line(colour = "white"),
                panel.grid.minor = element_line(colour = "white"))
 # Or just change a large number of graphical elements at once to a specified theme:
 example + theme_bw()

 # ggthemes also has an excellent selection of themes to choose from. Check out
 # what's available at: https://github.com/jrnold/ggthemes
 library(ggthemes)
 example + theme_wsj()

 # To save a file use ggsave(). Defaults to last plot made but you can specify a 
 # plot with "plot = plotName" as one of the arguments. File extension is
 # automatically chosen based on filename.
 ggsave(filename = "example.png", width = 10, height = 10, units = "cm")

 # I recommend using the Cairo package when exporting, as it performs
 # antialiasing. This will only make a visible difference in plots with lots of
 # tiny datapoints or complex shapes (ie. not a bar plot).
 library(Cairo)
 ggsave(filename = "example-cairo.png", width = 10, height = 10, units = "cm", type = "cairo-png")

 # So yeah, ggplot2 is a pretty powerful package. To see what's possible, read
 # the documentation at: http://docs.ggplot2.org/current/

 # Also helpful: 
 # http://zevross.com/blog/2014/08/04/beautiful-plotting-in-r-a-ggplot2-cheatsheet-3/
	# biomaRt is a data mining tool that allows you extract massive amounts of data
	# from online databases with minimal effort.

	# This tutorial will use biomaRt to find the average distance between
	# transcription factor binding sites and the genes they are known to act on in
	# Drosophila (let's say we want to know how large an area we need to search
	# up/down from a gene while looking for new binding sites). Interestingly, I
	# don't think there's actually a published value for this, so we're doing
	# something new here.

	# Jeff Stafford

	# Load our starting dataset.
	known_enhancers <- read.delim(file= "oreganno_dmel_full.txt", as.is = TRUE)

	# This file was generated from oreganno_FULL_08Nov10.txt.gz, retrieved from
	# http://www.oreganno.org/oregano/Dump.jsp. Oreganno is a free and open
	# regulatory element database. This is literally all of the Drosophila-related
	# information in the database.
	str(known_enhancers)

	# Bash commands used to extract the Drosophila genes and generate the above file:
	# gunzip oreganno_FULL_08Nov10.txt.gz
	# grep -i 'drosophila melanogaster' > oreganno_dmel_data.txt
	# head -n 1 oreganno_FULL_08Nov10.txt > oreganno_dmel_full.txt
	# cat oreganno_dmel_data.txt >> oreganno_dmel_full.txt

	# We are going to try to calculate the average distance between transcription
	# factors and their targets. To do this, we need the start and end positions of
	# each target.

	# Our database already has the enhancer locations and targets.
	head(cbind(known_enhancers$Gene.name, known_enhancers$chromStart))

	# Now, let's try and retrieve all the information that would normally be in a
	# .bed file for each gene (the gene start and end).

	library(biomaRt)

	# We can browse a list of "marts" to use, generally each one is maintained by a
	# separate organization. We're going to use ensembl, as it generally has the
	# most variety in terms of species.
	martList <- listMarts()
	head(martList)
	ensembl <- useMart("ensembl")

	# Which datasets do we want to use?
	datasets <- listDatasets(ensembl)
	head(datasets)
	ensembl <- useMart("ensembl", dataset = "dmelanogaster_gene_ensembl")
	# As an important side note, the Oreganno data and ensembl mart are both using
	# the BDGP5 annotation (the current annotation is BDGP6). A lot of things
	# changed between these two versions, so using mismatched annotations would be BAD.

	# What data do we want to retrieve?
	attributes <- listAttributes(ensembl)
	head(attributes)

	# In this case, we want to grab several different versions of each gene name,
	# chromosome each gene is on, and the start and end positions for each.
	bedmap <- getBM(attributes = c("ensembl_gene_id", # FBgn number - changes often, but is highly specific
	"flybasename_gene", # Actual gene names
	"flybasecgid_gene", # CG number - similar to FBgn, but not all datasets have this.
	"chromosome_name",
	"start_position",
	"end_position"),
	filters = "flybasename_gene", # "filters" is the category of values we are searching by
	values = known_enhancers$Gene.name, # "values" are the actual values we are searching with
	mart = ensembl) # Need to specify the mart/dataset we are searching.

	# If something goes wrong, or if Ensembl ever updates its mart to BDGP6, we can
	# also load the data this way:
	# bedmap <- read.csv("TFbedmap.csv", as.is = TRUE)

	# Check to make sure we got an annotation for every gene known to be associated
	# with a transcription factor.
	length(unique(known_enhancers$Gene.ID)) == length(unique(bedmap$flybasename_gene))

	# Upon closer inspection of our TF dataset, much of the mismatch is due to the
	# fact that not all of our binding sites are actually binding sites. The dataset
	# includes regulatory regions not associated with any gene.
	unique(known_enhancers$Type)
	sub <- subset(known_enhancers, known_enhancers$Type == "TRANSCRIPTION FACTOR BINDING SITE")
	sum(is.na(match(bedmap$flybasecgid_gene, sub$Gene.ID)))/length(sub$Gene.ID)
	# All in all, it looks like we are missing about gene start/end information for
	# 11 percent of our genes. This is likely due to an annotation mismatch that I
	# don't particularly feel like troubleshooting at the moment.

	# Match up and annotate our enhancer db with what we know about their target genes.
	tf_idx <- match(known_enhancers$Gene.ID, bedmap$flybasecgid_gene)

	# With this index, we can simply add in our new data from ensembl.
	known_enhancers$gene_start <- bedmap$start_position[tf_idx]
	known_enhancers$gene_end <- bedmap$end_position[tf_idx]

	# Okay so now we know where all of the genes in our TF database start. Lets
	# calculate distances.
	removeNegative <- function(bp) {
	if (is.na(bp)) {
	NA
	}
	else if (bp<0) {
	NA
	} else {
	bp
	}
	}

	known_enhancers$dist_from_start <- known_enhancers$chromStart - known_enhancers$gene_start
	known_enhancers$dist_from_start <- apply(as.matrix(known_enhancers$dist_from_start),1,removeNegative)
	known_enhancers$dist_from_end <- known_enhancers$chromStart - known_enhancers$gene_end
	known_enhancers$dist_from_end <- apply(as.matrix(known_enhancers$dist_from_end),1,removeNegative)

	mean(known_enhancers$dist_from_start, na.rm = TRUE)
	mean(known_enhancers$dist_from_end, na.rm = TRUE)

	range(known_enhancers$dist_from_start, na.rm = TRUE)
	range(known_enhancers$dist_from_end, na.rm = TRUE)

	# That's huge, but it looks like there two TFs with absurdly distant enhancers...
	# I'm going to arbitrarily exclude outliers over 50kb as a result (the stuff
	# we'd probably want to do for downstream analysis won't even go that far
	# anyways).

	start_subset <- subset(known_enhancers, known_enhancers$dist_from_start < 50000)
	hist(start_subset$dist_from_start, breaks = seq(0,50000,1000))
	end_subset <- subset(known_enhancers, known_enhancers$dist_from_end < 50000)
	hist(end_subset$dist_from_end, breaks = seq(0,50000,1000))

	start_mean <- mean(start_subset$dist_from_start, na.rm = TRUE)
	start_sd <- sd(start_subset$dist_from_start, na.rm = TRUE)

	end_mean <- mean(end_subset$dist_from_end, na.rm = TRUE)
	end_sd <- sd(end_subset$dist_from_end, na.rm = TRUE)

	start_mean + start_sd
	end_mean + end_sd
	# It looks like enhancers are generally within 26.5kb of genes in Drosophila.

	length(start_subset$dist_from_start)/length(end_subset$dist_from_end)
	# There are twice as many known enhancers upstream of genes compared to downstream.

	# Again, these are very rough estimates and should be taken with a MASSIVE grain
	# of salt (this is absurdly biased towards genes/enhancers that have been
	# studied more). But hey, we answered our initial question and learned to use
	# biomaRt, right?
	# ggplot2 is a plotting framework that is relatively easy to use, powerful, AND
	# it looks good.

	# Jeff Stafford

	library(ggplot2)

	# Here's the data we're going to use (bundled with ggplot2).
	str(msleep)
	# Looks like it's data on sleep of some kind.
	data <- msleep

	# What is a ggplot2 object? Basically it is your data + information on how to
	# interpret it + the actual geometry it uses to plot it.

	# How to create ggplot2 objects:

	# You can add as much data in the inital function call as you want. All of these
	# work, but the final version is the only "complete" object that fully specifies
	# the data used for the plot.
	ref <- ggplot()
	ref <- ggplot(data)
	ref <- ggplot(data, aes(x = bodywt, y = sleep_total))

	# To store an object (to add to it later/plot it on demand), just give it a
	# reference. Simply typing the reference will display the plot (if you've
	# provided enough information to make it.)
	ref
	# As you can see, we haven't specified everything we need yet.

	# IMPORTANT: There are 3 components to making a plot with a ggplot object: your
	# data, the aesthetic mappings of your data, and the geometry. If you are
	# missing one, you won't get a functional plot.

	# Your data should be a dataframe with everything you want to plot. Note that it
	# is possible to put data from multiple sources (ie. different dataframes) in
	# the same plot, but it's easier if everything is in the same 2-dimensional
	# dataframe.
	ref <- ggplot(data)

	# The aesthetic mappings tell ggplot2 how to interpret your data. Which values
	# in your dataframe are the y-values, x-values, what should be used for colors, etc.
	ref <- ggplot(data, aes(x = bodywt, y = sleep_total))

	# The geometry is the actual stuff that goes on the plot. You can specify any
	# geometry as long as you have supplied the values it needs. If you've specified
	# the required aesthetic mappings (which data corresponds to x, y, etc.), all
	# you need to do is tell ggplot2 to create a certain geometry- for instance a
	# scatterplot.

	# Just add the geometry you want to your object. In this case, we are making a scatterplot.
	ref <- ggplot(data, aes(x = bodywt, y = sleep_total)) + geom_point()
	ref

	# All you need to do to add more information to your plot/change things is add
	# on more elements. Lets add a logarithmic scale on the x axis.
	ref <- ref + scale_x_log10()
	ref

	# Lets add a smoothed mean.
	ref + geom_smooth()

	# You can also specify aesthetics inside the call to create geomtery.
	ref <- ggplot(data) + geom_point(aes(x = bodywt, y = sleep_total)) + scale_x_log10()
	ref
	ref <- ref + geom_smooth()
	ref
	# Why didn't that work? This is because when we specfy aesthetics inside a call
	# to geomtery it only applies for that layer (only geom_point got the x and y
	# values). The only information that gets passed to all geometery calls is
	# aethetics specified in the initial creation of the ggplot object.

	# So if we wanted that to work, we'd have to do this:
	ggplot(data) + scale_x_log10() +
	geom_point(aes(x = bodywt, y = sleep_total)) +
	geom_smooth(aes(x = bodywt, y = sleep_total))

	# It's important to note that geometry will automatically use any aesthetic
	# mappings that it understands, and ignore ones it doesn't. So if you specify as
	# much stuff as you can in the inital call that can be used, it'll save you
	# work.

	# Like this:
	ggplot(data, aes(x = bodywt, y = sleep_total)) + scale_x_log10() + geom_point() + geom_smooth()

	# Let's follow up with a few very common plot/geometry types and mappings you
	# might be interested in:

	# These x and y mappings (and the log scale on the x axis will be used for all later plots).
	plot <- ggplot(data, aes(x = bodywt, y = sleep_total)) + scale_x_log10()

	# First lets add color based on what things eat. Note that it automatically adds a legend.
	plot + geom_point(aes(color = vore))
	# We used a factor there, but we can also use a continuous variable for color as well.
	plot + geom_point(aes(color = brainwt))
	# We can change the legend to change the colors in this case.
	plot + geom_point(aes(color = brainwt)) + scale_color_gradient2()
	# Change the colors
	plot + geom_point(aes(color = log(brainwt))) +
	scale_color_gradient2(low = "green", mid = "yellow", high = "red",
	midpoint = -4, na.value = "purple")

	# How about changing size?
	plot + geom_point(aes(size = sleep_rem))
	# Or alpha (add some titles and labels while we're at it)?
	plot + geom_point(aes(alpha = sleep_rem)) +
	xlab("this is our x axis") + ylab("this is our y axis") + ggtitle("title") + scale_alpha("our legend")

	# If we want to simply change a plot value like marker shape or size without
	# mapping it to data, just specify it outside the call to aesthetics.
	plot + geom_point(aes(shape = vore), size = 6, color = "orange")

	# Let's facet our data by a factor:
	plot + geom_point() + facet_wrap(~vore)

	# Let's put it all together...

	library(scales)
	# oob specifies what to do with out of bounds values for any scale (normally the
	# value gets changed to NA), "squish" sets them to scale max or min, to use
	# squish you need the "scales" package.

	ggplot(data, aes(x = bodywt, y = sleep_total, size = log(brainwt), color = sleep_rem)) +
	scale_x_log10("Body weight") + scale_y_continuous("Total sleep (hours)") +
	geom_point() +
	facet_wrap(~ vore, nrow = 1 , ncol = 5) +
	scale_color_gradient(low = "firebrick1", na.value = "green", limits = c(0,4), oob = squish)

	# Note that we were manipulating aesthetic mappings that geom_point()
	# understands. To see what it understands, check out either the help for
	# ?geom_point or its documentation (with examples) at
	# http://docs.ggplot2.org/current/

	# Now for a few other types of plots:

	# Boxplot... note that stats are automatically performed, more about that later...
	ggplot(data, aes(x = vore, y = sleep_total)) + geom_boxplot()
	ggplot(data, aes(x = vore, y = sleep_total, fill = vore)) + geom_boxplot()

	# 1D density
	ggplot(data, aes(x = sleep_total, fill = vore)) + geom_density(alpha = 0.5)
	# 2D density
	ggplot(data, aes(x = sleep_total, y = sleep_rem)) + geom_density2d()
	# Violin plot
	ggplot(data, aes(x = vore, y = sleep_total)) + geom_violin()
	# Jittered scatterplot
	ggplot(data, aes(x = vore, y = sleep_total)) + geom_jitter(position = position_jitter(width = 0.2))
	# Another method for jittering a scatterplot + violin plot
	ggplot(data, aes(x = vore, y = sleep_total)) + geom_violin() + geom_point(position = "jitter")
	# Bar plot
	ggplot(data, aes(x = vore)) + geom_bar()
	# Note that it automatically is binning the number of values in "vore".

	# Bars are automatically ordered alphabetically (apparently people say that this
	# is not a bug, it's a "feature"...). To reorder a factor:
	reordered <- factor(data$vore, levels = c("herbi","omni","carni", "insecti", NA))
	# Anything that reorders a factor will work to change bar order, order of color labels, etc.
	ggplot() + geom_bar(aes(x = reordered))

	# Let's graph mean sleep/category instead of just the raw number of animals in each category.
	sub <- subset(data, is.na(data$vore) == FALSE)
	categories <- unique(sub$vore)
	sleepMeans <- rep(NA, length(categories))
	names(sleepMeans) <- categories
	sleepSEM <- sleepMeans
	for (cat in categories) {
	sleepMeans[cat] <- mean(sub$sleep_total[sub$vore == cat])
	sleepSEM[cat] <- sd(sub$sleep_total)/sqrt(length(sub$sleep_total[sub$vore == cat]))
	}
	ggplot() + geom_bar(aes(x = sleepMeans, fill = names(sleepMeans)))
	# What happened? geom_bar() and (ggplot2 in general) automatically bins values,
	# which can be really annoying. So it's counting one value for each level of the factor.

	# Use "stat_identity" when calling geom_bar instead (geom_bar() implicitly calls
	# "stat_bin") and map a value to y.
	ggplot() + geom_bar(aes(x = names(sleepMeans), y = sleepMeans, fill = names(sleepMeans)), stat = "identity")

	# Converting to a dataframe for ease-of-use later.
	sleep <- as.data.frame(sleepMeans)
	colnames(sleep) <- c("means")

	# Let's add error bars, we calculated standard error of the mean earlier...
	plot <- ggplot(sleep, aes(x = rownames(sleep), y = means, fill = rownames(sleep),
	ymin = means - sleepSEM, ymax = means + sleepSEM)) +
	geom_bar(stat = "identity")
	plot + geom_errorbar()
	# Change errorbar width:
	plot + geom_errorbar(width = 0.5)


	# Let's do an in-depth example (all of this can be applied to other plot types):

	# Reorder bars in descending order of their value
	idx <- order(sleep$means, decreasing = TRUE)
	sleep$name <- factor(rownames(sleep), levels = rownames(sleep)[idx])

	# Create a custom color palette with RColorBrewer
	library(RColorBrewer)
	display.brewer.all()
	palette <- brewer.pal(n = length(rownames(sleep))*2, "Spectral")[seq.int(1,8,2)]
	names(palette) <- levels(sleep$name)

	# Notice that it's just using hexadecimal color codes. You can use a vector of
	# any R colors/hex codes you can think of.
	palette

	example <- ggplot(sleep, aes(x = name, y = means, fill = name,
	ymin = means - sleepSEM, ymax = means + sleepSEM)) +
	geom_bar(stat = "identity") + geom_errorbar(width = 0.5) +
	scale_y_continuous(limits = c(0, max(sleep$means)*1.5)) +
	xlab("Food type") + ylab("Average sleep per night (hours)") +
	scale_fill_manual(values = palette) +
	guides(fill = FALSE) # this kills the redundant legend
	example

	# Change theme elements to white.
	example + theme(panel.background = element_rect(fill = "white"),
	panel.grid.major = element_line(colour = "white"),
	panel.grid.minor = element_line(colour = "white"))
	# Or just change a large number of graphical elements at once to a specified theme:
	example + theme_bw()

	# ggthemes also has an excellent selection of themes to choose from. Check out
	# what's available at: https://github.com/jrnold/ggthemes
	library(ggthemes)
	example + theme_wsj()

	# To save a file use ggsave(). Defaults to last plot made but you can specify a
	# plot with "plot = plotName" as one of the arguments. File extension is
	# automatically chosen based on filename.
	ggsave(filename = "example.png", width = 10, height = 10, units = "cm")

	# I recommend using the Cairo package when exporting, as it performs
	# antialiasing. This will only make a visible difference in plots with lots of
	# tiny datapoints or complex shapes (ie. not a bar plot).
	library(Cairo)
	ggsave(filename = "example-cairo.png", width = 10, height = 10, units = "cm", type = "cairo-png")

	# So yeah, ggplot2 is a pretty powerful package. To see what's possible, read
	# the documentation at: http://docs.ggplot2.org/current/

	# Also helpful:
	# http://zevross.com/blog/2014/08/04/beautiful-plotting-in-r-a-ggplot2-cheatsheet-3/