Last active
December 4, 2015 15:40
-
-
Save blahah/791cdc26823a3ee33388 to your computer and use it in GitHub Desktop.
Summary statistics and plots of transcriptome assemblies (for ICIPE TReND NGS students)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
setwd("~/project_icipe") | |
library(ggplot2) | |
# 1. load the files | |
oases <- read.csv('./oases.csv') | |
soap <- read.csv('./soapdenovotrans.csv') | |
# check the number of rows and columns | |
dim(oases) | |
dim(soap) | |
# add an 'assembler' column | |
oases$assembler <- 'oases' | |
soap$assembler <- 'soap' | |
# create a merged data frame | |
assemblies <- rbind(oases, soap) | |
# 2. plot number of contigs in each assembly | |
library(ggplot2) | |
ggplot(assemblies, aes(x = assembler)) + | |
geom_bar() + | |
xlab('Assembler') + | |
ylab('Number of contigs') + | |
theme_bw() | |
# 3. add to the previous plot the proportion of each assembly that had a reference hit | |
ggplot(assemblies, aes(x = assembler, fill = has_crb)) + | |
geom_bar() + | |
xlab('Assembler') + | |
ylab('Number of contigs') + | |
theme_bw() | |
# 4. if a contig matches a reference transcript, the name of the reference | |
# transcript is in the 'hits' column. How many reference transcripts in total | |
# were hit across both assemblies? | |
total_hits <- length(levels(assemblies[!is.na(assemblies$hits), 'hits'])) | |
print(total_hits) | |
# 5. how many reference transcripts were represented in each assembly? | |
hits_each <- data.frame( | |
oases = length(levels(oases[!is.na(oases$hits), 'hits'])), | |
soap = length(levels(soap[!is.na(soap$hits), 'hits'])) | |
) | |
print(hits_each) | |
# 6. how many of the reference transcripts were only found in that assembly? | |
oases_hits <- levels(oases[!is.na(oases$hits), 'hits']) | |
soap_hits <- levels(soap[!is.na(soap$hits), 'hits']) | |
unique_hits <- data.frame( | |
oases = length(setdiff(oases_hits, soap_hits)), | |
soap = length(setdiff(soap_hits, oases_hits)) | |
) | |
print(unique_hits) | |
# 7. The reference coverage column is the proportion of the reference | |
# transcript that was reconstructed by this contig. What is the average | |
# (mean and median) for each assembly? | |
oases_coverage <- oases[!is.na(oases$reference_coverage), 'reference_coverage'] | |
soap_coverage <- soap[!is.na(soap$reference_coverage), 'reference_coverage'] | |
average_coverage <- data.frame( | |
calculation = c('mean', 'median'), | |
oases = c(mean(oases_coverage), median(oases_coverage)), | |
soap = c(mean(soap_coverage), median(soap_coverage)) | |
) | |
print(average_coverage) | |
# 8. Make a bar plot showing how many contigs in each assembly | |
# had a reference coverage of at least 0.75 | |
ggplot(assemblies['reference_coverage' >= 0.75, ], | |
aes(x = assembler)) + | |
geom_bar() + | |
xlab('Assembler') + | |
ylab('Contigs with reference coverage >= 75%') | |
# plot contig score distribution for each assembly | |
ggplot(assemblies, aes(x = reference_coverage * 100, fill = assembler)) + | |
geom_histogram() + | |
facet_grid(assembler ~ .) + | |
xlim(0, 100) + | |
guides(fill = FALSE) + | |
xlab('Reference coverage (%)') + | |
ylab('Number of contigs') | |
# 10. We want to know how 'complete' each assembly is. | |
# The latest human reference transcriptome has 198,634 transcripts. | |
# Plot the proportion of reference transcripts that were | |
# found in each assembly. | |
human_transcripts <- 198634 | |
reference_represented <- data.frame( | |
assembler = c('oases', 'soap'), | |
proportion = c(hits_each$oases / human_transcripts, | |
hits_each$soap / human_transcripts) | |
) | |
ggplot(reference_represented, aes(x = assembler, y = proportion * 100)) + | |
geom_bar(stat = 'identity') + | |
ylab('Percent of reference transcripts assembled') + | |
xlab('Assembler') | |
# 11. Just because a transcript was found, does not mean it was | |
# assembled well! Multiply the proportions from the previous plot | |
# by the mean reference coverage, and show these in a new plot. | |
reference_represented$score <- | |
unlist(reference_represented$proportion * average_coverage[1, 2:3]) | |
ggplot(reference_represented, aes(x = assembler, y = score * 100)) + | |
geom_bar(stat = 'identity') + | |
ylab('Overall score (%)') + | |
xlab('Assembler') | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment