Created
March 8, 2015 02:09
-
-
Save jstaf/393db724d6c3e8e68fc4 to your computer and use it in GitHub Desktop.
R Club - March 10
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# biomaRt is a data mining tool that allows you extract massive amounts of data | |
# from online databases with minimal effort. | |
# This tutorial will use biomaRt to find the average distance between | |
# transcription factor binding sites and the genes they are known to act on in | |
# Drosophila (let's say we want to know how large an area we need to search | |
# up/down from a gene while looking for new binding sites). Interestingly, I | |
# don't think there's actually a published value for this, so we're doing | |
# something new here. | |
# Jeff Stafford | |
# Load our starting dataset. | |
known_enhancers <- read.delim(file= "oreganno_dmel_full.txt", as.is = TRUE) | |
# This file was generated from oreganno_FULL_08Nov10.txt.gz, retrieved from | |
# http://www.oreganno.org/oregano/Dump.jsp. Oreganno is a free and open | |
# regulatory element database. This is literally all of the Drosophila-related | |
# information in the database. | |
str(known_enhancers) | |
# Bash commands used to extract the Drosophila genes and generate the above file: | |
# gunzip oreganno_FULL_08Nov10.txt.gz | |
# grep -i 'drosophila melanogaster' > oreganno_dmel_data.txt | |
# head -n 1 oreganno_FULL_08Nov10.txt > oreganno_dmel_full.txt | |
# cat oreganno_dmel_data.txt >> oreganno_dmel_full.txt | |
# We are going to try to calculate the average distance between transcription | |
# factors and their targets. To do this, we need the start and end positions of | |
# each target. | |
# Our database already has the enhancer locations and targets. | |
head(cbind(known_enhancers$Gene.name, known_enhancers$chromStart)) | |
# Now, let's try and retrieve all the information that would normally be in a | |
# .bed file for each gene (the gene start and end). | |
library(biomaRt) | |
# We can browse a list of "marts" to use, generally each one is maintained by a | |
# separate organization. We're going to use ensembl, as it generally has the | |
# most variety in terms of species. | |
martList <- listMarts() | |
head(martList) | |
ensembl <- useMart("ensembl") | |
# Which datasets do we want to use? | |
datasets <- listDatasets(ensembl) | |
head(datasets) | |
ensembl <- useMart("ensembl", dataset = "dmelanogaster_gene_ensembl") | |
# As an important side note, the Oreganno data and ensembl mart are both using | |
# the BDGP5 annotation (the current annotation is BDGP6). A lot of things | |
# changed between these two versions, so using mismatched annotations would be BAD. | |
# What data do we want to retrieve? | |
attributes <- listAttributes(ensembl) | |
head(attributes) | |
# In this case, we want to grab several different versions of each gene name, | |
# chromosome each gene is on, and the start and end positions for each. | |
bedmap <- getBM(attributes = c("ensembl_gene_id", # FBgn number - changes often, but is highly specific | |
"flybasename_gene", # Actual gene names | |
"flybasecgid_gene", # CG number - similar to FBgn, but not all datasets have this. | |
"chromosome_name", | |
"start_position", | |
"end_position"), | |
filters = "flybasename_gene", # "filters" is the category of values we are searching by | |
values = known_enhancers$Gene.name, # "values" are the actual values we are searching with | |
mart = ensembl) # Need to specify the mart/dataset we are searching. | |
# If something goes wrong, or if Ensembl ever updates its mart to BDGP6, we can | |
# also load the data this way: | |
# bedmap <- read.csv("TFbedmap.csv", as.is = TRUE) | |
# Check to make sure we got an annotation for every gene known to be associated | |
# with a transcription factor. | |
length(unique(known_enhancers$Gene.ID)) == length(unique(bedmap$flybasename_gene)) | |
# Upon closer inspection of our TF dataset, much of the mismatch is due to the | |
# fact that not all of our binding sites are actually binding sites. The dataset | |
# includes regulatory regions not associated with any gene. | |
unique(known_enhancers$Type) | |
sub <- subset(known_enhancers, known_enhancers$Type == "TRANSCRIPTION FACTOR BINDING SITE") | |
sum(is.na(match(bedmap$flybasecgid_gene, sub$Gene.ID)))/length(sub$Gene.ID) | |
# All in all, it looks like we are missing about gene start/end information for | |
# 11 percent of our genes. This is likely due to an annotation mismatch that I | |
# don't particularly feel like troubleshooting at the moment. | |
# Match up and annotate our enhancer db with what we know about their target genes. | |
tf_idx <- match(known_enhancers$Gene.ID, bedmap$flybasecgid_gene) | |
# With this index, we can simply add in our new data from ensembl. | |
known_enhancers$gene_start <- bedmap$start_position[tf_idx] | |
known_enhancers$gene_end <- bedmap$end_position[tf_idx] | |
# Okay so now we know where all of the genes in our TF database start. Lets | |
# calculate distances. | |
removeNegative <- function(bp) { | |
if (is.na(bp)) { | |
NA | |
} | |
else if (bp<0) { | |
NA | |
} else { | |
bp | |
} | |
} | |
known_enhancers$dist_from_start <- known_enhancers$chromStart - known_enhancers$gene_start | |
known_enhancers$dist_from_start <- apply(as.matrix(known_enhancers$dist_from_start),1,removeNegative) | |
known_enhancers$dist_from_end <- known_enhancers$chromStart - known_enhancers$gene_end | |
known_enhancers$dist_from_end <- apply(as.matrix(known_enhancers$dist_from_end),1,removeNegative) | |
mean(known_enhancers$dist_from_start, na.rm = TRUE) | |
mean(known_enhancers$dist_from_end, na.rm = TRUE) | |
range(known_enhancers$dist_from_start, na.rm = TRUE) | |
range(known_enhancers$dist_from_end, na.rm = TRUE) | |
# That's huge, but it looks like there two TFs with absurdly distant enhancers... | |
# I'm going to arbitrarily exclude outliers over 50kb as a result (the stuff | |
# we'd probably want to do for downstream analysis won't even go that far | |
# anyways). | |
start_subset <- subset(known_enhancers, known_enhancers$dist_from_start < 50000) | |
hist(start_subset$dist_from_start, breaks = seq(0,50000,1000)) | |
end_subset <- subset(known_enhancers, known_enhancers$dist_from_end < 50000) | |
hist(end_subset$dist_from_end, breaks = seq(0,50000,1000)) | |
start_mean <- mean(start_subset$dist_from_start, na.rm = TRUE) | |
start_sd <- sd(start_subset$dist_from_start, na.rm = TRUE) | |
end_mean <- mean(end_subset$dist_from_end, na.rm = TRUE) | |
end_sd <- sd(end_subset$dist_from_end, na.rm = TRUE) | |
start_mean + start_sd | |
end_mean + end_sd | |
# It looks like enhancers are generally within 26.5kb of genes in Drosophila. | |
length(start_subset$dist_from_start)/length(end_subset$dist_from_end) | |
# There are twice as many known enhancers upstream of genes compared to downstream. | |
# Again, these are very rough estimates and should be taken with a MASSIVE grain | |
# of salt (this is absurdly biased towards genes/enhancers that have been | |
# studied more). But hey, we answered our initial question and learned to use | |
# biomaRt, right? | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ggplot2 is a plotting framework that is relatively easy to use, powerful, AND | |
# it looks good. | |
# Jeff Stafford | |
library(ggplot2) | |
# Here's the data we're going to use (bundled with ggplot2). | |
str(msleep) | |
# Looks like it's data on sleep of some kind. | |
data <- msleep | |
# What is a ggplot2 object? Basically it is your data + information on how to | |
# interpret it + the actual geometry it uses to plot it. | |
# How to create ggplot2 objects: | |
# You can add as much data in the inital function call as you want. All of these | |
# work, but the final version is the only "complete" object that fully specifies | |
# the data used for the plot. | |
ref <- ggplot() | |
ref <- ggplot(data) | |
ref <- ggplot(data, aes(x = bodywt, y = sleep_total)) | |
# To store an object (to add to it later/plot it on demand), just give it a | |
# reference. Simply typing the reference will display the plot (if you've | |
# provided enough information to make it.) | |
ref | |
# As you can see, we haven't specified everything we need yet. | |
# IMPORTANT: There are 3 components to making a plot with a ggplot object: your | |
# data, the aesthetic mappings of your data, and the geometry. If you are | |
# missing one, you won't get a functional plot. | |
# Your data should be a dataframe with everything you want to plot. Note that it | |
# is possible to put data from multiple sources (ie. different dataframes) in | |
# the same plot, but it's easier if everything is in the same 2-dimensional | |
# dataframe. | |
ref <- ggplot(data) | |
# The aesthetic mappings tell ggplot2 how to interpret your data. Which values | |
# in your dataframe are the y-values, x-values, what should be used for colors, etc. | |
ref <- ggplot(data, aes(x = bodywt, y = sleep_total)) | |
# The geometry is the actual stuff that goes on the plot. You can specify any | |
# geometry as long as you have supplied the values it needs. If you've specified | |
# the required aesthetic mappings (which data corresponds to x, y, etc.), all | |
# you need to do is tell ggplot2 to create a certain geometry- for instance a | |
# scatterplot. | |
# Just add the geometry you want to your object. In this case, we are making a scatterplot. | |
ref <- ggplot(data, aes(x = bodywt, y = sleep_total)) + geom_point() | |
ref | |
# All you need to do to add more information to your plot/change things is add | |
# on more elements. Lets add a logarithmic scale on the x axis. | |
ref <- ref + scale_x_log10() | |
ref | |
# Lets add a smoothed mean. | |
ref + geom_smooth() | |
# You can also specify aesthetics inside the call to create geomtery. | |
ref <- ggplot(data) + geom_point(aes(x = bodywt, y = sleep_total)) + scale_x_log10() | |
ref | |
ref <- ref + geom_smooth() | |
ref | |
# Why didn't that work? This is because when we specfy aesthetics inside a call | |
# to geomtery it only applies for that layer (only geom_point got the x and y | |
# values). The only information that gets passed to all geometery calls is | |
# aethetics specified in the initial creation of the ggplot object. | |
# So if we wanted that to work, we'd have to do this: | |
ggplot(data) + scale_x_log10() + | |
geom_point(aes(x = bodywt, y = sleep_total)) + | |
geom_smooth(aes(x = bodywt, y = sleep_total)) | |
# It's important to note that geometry will automatically use any aesthetic | |
# mappings that it understands, and ignore ones it doesn't. So if you specify as | |
# much stuff as you can in the inital call that can be used, it'll save you | |
# work. | |
# Like this: | |
ggplot(data, aes(x = bodywt, y = sleep_total)) + scale_x_log10() + geom_point() + geom_smooth() | |
# Let's follow up with a few very common plot/geometry types and mappings you | |
# might be interested in: | |
# These x and y mappings (and the log scale on the x axis will be used for all later plots). | |
plot <- ggplot(data, aes(x = bodywt, y = sleep_total)) + scale_x_log10() | |
# First lets add color based on what things eat. Note that it automatically adds a legend. | |
plot + geom_point(aes(color = vore)) | |
# We used a factor there, but we can also use a continuous variable for color as well. | |
plot + geom_point(aes(color = brainwt)) | |
# We can change the legend to change the colors in this case. | |
plot + geom_point(aes(color = brainwt)) + scale_color_gradient2() | |
# Change the colors | |
plot + geom_point(aes(color = log(brainwt))) + | |
scale_color_gradient2(low = "green", mid = "yellow", high = "red", | |
midpoint = -4, na.value = "purple") | |
# How about changing size? | |
plot + geom_point(aes(size = sleep_rem)) | |
# Or alpha (add some titles and labels while we're at it)? | |
plot + geom_point(aes(alpha = sleep_rem)) + | |
xlab("this is our x axis") + ylab("this is our y axis") + ggtitle("title") + scale_alpha("our legend") | |
# If we want to simply change a plot value like marker shape or size without | |
# mapping it to data, just specify it outside the call to aesthetics. | |
plot + geom_point(aes(shape = vore), size = 6, color = "orange") | |
# Let's facet our data by a factor: | |
plot + geom_point() + facet_wrap(~vore) | |
# Let's put it all together... | |
library(scales) | |
# oob specifies what to do with out of bounds values for any scale (normally the | |
# value gets changed to NA), "squish" sets them to scale max or min, to use | |
# squish you need the "scales" package. | |
ggplot(data, aes(x = bodywt, y = sleep_total, size = log(brainwt), color = sleep_rem)) + | |
scale_x_log10("Body weight") + scale_y_continuous("Total sleep (hours)") + | |
geom_point() + | |
facet_wrap(~ vore, nrow = 1 , ncol = 5) + | |
scale_color_gradient(low = "firebrick1", na.value = "green", limits = c(0,4), oob = squish) | |
# Note that we were manipulating aesthetic mappings that geom_point() | |
# understands. To see what it understands, check out either the help for | |
# ?geom_point or its documentation (with examples) at | |
# http://docs.ggplot2.org/current/ | |
# Now for a few other types of plots: | |
# Boxplot... note that stats are automatically performed, more about that later... | |
ggplot(data, aes(x = vore, y = sleep_total)) + geom_boxplot() | |
ggplot(data, aes(x = vore, y = sleep_total, fill = vore)) + geom_boxplot() | |
# 1D density | |
ggplot(data, aes(x = sleep_total, fill = vore)) + geom_density(alpha = 0.5) | |
# 2D density | |
ggplot(data, aes(x = sleep_total, y = sleep_rem)) + geom_density2d() | |
# Violin plot | |
ggplot(data, aes(x = vore, y = sleep_total)) + geom_violin() | |
# Jittered scatterplot | |
ggplot(data, aes(x = vore, y = sleep_total)) + geom_jitter(position = position_jitter(width = 0.2)) | |
# Another method for jittering a scatterplot + violin plot | |
ggplot(data, aes(x = vore, y = sleep_total)) + geom_violin() + geom_point(position = "jitter") | |
# Bar plot | |
ggplot(data, aes(x = vore)) + geom_bar() | |
# Note that it automatically is binning the number of values in "vore". | |
# Bars are automatically ordered alphabetically (apparently people say that this | |
# is not a bug, it's a "feature"...). To reorder a factor: | |
reordered <- factor(data$vore, levels = c("herbi","omni","carni", "insecti", NA)) | |
# Anything that reorders a factor will work to change bar order, order of color labels, etc. | |
ggplot() + geom_bar(aes(x = reordered)) | |
# Let's graph mean sleep/category instead of just the raw number of animals in each category. | |
sub <- subset(data, is.na(data$vore) == FALSE) | |
categories <- unique(sub$vore) | |
sleepMeans <- rep(NA, length(categories)) | |
names(sleepMeans) <- categories | |
sleepSEM <- sleepMeans | |
for (cat in categories) { | |
sleepMeans[cat] <- mean(sub$sleep_total[sub$vore == cat]) | |
sleepSEM[cat] <- sd(sub$sleep_total)/sqrt(length(sub$sleep_total[sub$vore == cat])) | |
} | |
ggplot() + geom_bar(aes(x = sleepMeans, fill = names(sleepMeans))) | |
# What happened? geom_bar() and (ggplot2 in general) automatically bins values, | |
# which can be really annoying. So it's counting one value for each level of the factor. | |
# Use "stat_identity" when calling geom_bar instead (geom_bar() implicitly calls | |
# "stat_bin") and map a value to y. | |
ggplot() + geom_bar(aes(x = names(sleepMeans), y = sleepMeans, fill = names(sleepMeans)), stat = "identity") | |
# Converting to a dataframe for ease-of-use later. | |
sleep <- as.data.frame(sleepMeans) | |
colnames(sleep) <- c("means") | |
# Let's add error bars, we calculated standard error of the mean earlier... | |
plot <- ggplot(sleep, aes(x = rownames(sleep), y = means, fill = rownames(sleep), | |
ymin = means - sleepSEM, ymax = means + sleepSEM)) + | |
geom_bar(stat = "identity") | |
plot + geom_errorbar() | |
# Change errorbar width: | |
plot + geom_errorbar(width = 0.5) | |
# Let's do an in-depth example (all of this can be applied to other plot types): | |
# Reorder bars in descending order of their value | |
idx <- order(sleep$means, decreasing = TRUE) | |
sleep$name <- factor(rownames(sleep), levels = rownames(sleep)[idx]) | |
# Create a custom color palette with RColorBrewer | |
library(RColorBrewer) | |
display.brewer.all() | |
palette <- brewer.pal(n = length(rownames(sleep))*2, "Spectral")[seq.int(1,8,2)] | |
names(palette) <- levels(sleep$name) | |
# Notice that it's just using hexadecimal color codes. You can use a vector of | |
# any R colors/hex codes you can think of. | |
palette | |
example <- ggplot(sleep, aes(x = name, y = means, fill = name, | |
ymin = means - sleepSEM, ymax = means + sleepSEM)) + | |
geom_bar(stat = "identity") + geom_errorbar(width = 0.5) + | |
scale_y_continuous(limits = c(0, max(sleep$means)*1.5)) + | |
xlab("Food type") + ylab("Average sleep per night (hours)") + | |
scale_fill_manual(values = palette) + | |
guides(fill = FALSE) # this kills the redundant legend | |
example | |
# Change theme elements to white. | |
example + theme(panel.background = element_rect(fill = "white"), | |
panel.grid.major = element_line(colour = "white"), | |
panel.grid.minor = element_line(colour = "white")) | |
# Or just change a large number of graphical elements at once to a specified theme: | |
example + theme_bw() | |
# ggthemes also has an excellent selection of themes to choose from. Check out | |
# what's available at: https://github.com/jrnold/ggthemes | |
library(ggthemes) | |
example + theme_wsj() | |
# To save a file use ggsave(). Defaults to last plot made but you can specify a | |
# plot with "plot = plotName" as one of the arguments. File extension is | |
# automatically chosen based on filename. | |
ggsave(filename = "example.png", width = 10, height = 10, units = "cm") | |
# I recommend using the Cairo package when exporting, as it performs | |
# antialiasing. This will only make a visible difference in plots with lots of | |
# tiny datapoints or complex shapes (ie. not a bar plot). | |
library(Cairo) | |
ggsave(filename = "example-cairo.png", width = 10, height = 10, units = "cm", type = "cairo-png") | |
# So yeah, ggplot2 is a pretty powerful package. To see what's possible, read | |
# the documentation at: http://docs.ggplot2.org/current/ | |
# Also helpful: | |
# http://zevross.com/blog/2014/08/04/beautiful-plotting-in-r-a-ggplot2-cheatsheet-3/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment