Skip to content

Instantly share code, notes, and snippets.

@blahah
Last active December 4, 2015 15:40
Show Gist options
  • Save blahah/791cdc26823a3ee33388 to your computer and use it in GitHub Desktop.
Save blahah/791cdc26823a3ee33388 to your computer and use it in GitHub Desktop.
Summary statistics and plots of transcriptome assemblies (for ICIPE TReND NGS students)
setwd("~/project_icipe")
library(ggplot2)
# 1. load the files
oases <- read.csv('./oases.csv')
soap <- read.csv('./soapdenovotrans.csv')
# check the number of rows and columns
dim(oases)
dim(soap)
# add an 'assembler' column
oases$assembler <- 'oases'
soap$assembler <- 'soap'
# create a merged data frame
assemblies <- rbind(oases, soap)
# 2. plot number of contigs in each assembly
library(ggplot2)
ggplot(assemblies, aes(x = assembler)) +
geom_bar() +
xlab('Assembler') +
ylab('Number of contigs') +
theme_bw()
# 3. add to the previous plot the proportion of each assembly that had a reference hit
ggplot(assemblies, aes(x = assembler, fill = has_crb)) +
geom_bar() +
xlab('Assembler') +
ylab('Number of contigs') +
theme_bw()
# 4. if a contig matches a reference transcript, the name of the reference
# transcript is in the 'hits' column. How many reference transcripts in total
# were hit across both assemblies?
total_hits <- length(levels(assemblies[!is.na(assemblies$hits), 'hits']))
print(total_hits)
# 5. how many reference transcripts were represented in each assembly?
hits_each <- data.frame(
oases = length(levels(oases[!is.na(oases$hits), 'hits'])),
soap = length(levels(soap[!is.na(soap$hits), 'hits']))
)
print(hits_each)
# 6. how many of the reference transcripts were only found in that assembly?
oases_hits <- levels(oases[!is.na(oases$hits), 'hits'])
soap_hits <- levels(soap[!is.na(soap$hits), 'hits'])
unique_hits <- data.frame(
oases = length(setdiff(oases_hits, soap_hits)),
soap = length(setdiff(soap_hits, oases_hits))
)
print(unique_hits)
# 7. The reference coverage column is the proportion of the reference
# transcript that was reconstructed by this contig. What is the average
# (mean and median) for each assembly?
oases_coverage <- oases[!is.na(oases$reference_coverage), 'reference_coverage']
soap_coverage <- soap[!is.na(soap$reference_coverage), 'reference_coverage']
average_coverage <- data.frame(
calculation = c('mean', 'median'),
oases = c(mean(oases_coverage), median(oases_coverage)),
soap = c(mean(soap_coverage), median(soap_coverage))
)
print(average_coverage)
# 8. Make a bar plot showing how many contigs in each assembly
# had a reference coverage of at least 0.75
ggplot(assemblies['reference_coverage' >= 0.75, ],
aes(x = assembler)) +
geom_bar() +
xlab('Assembler') +
ylab('Contigs with reference coverage >= 75%')
# plot contig score distribution for each assembly
ggplot(assemblies, aes(x = reference_coverage * 100, fill = assembler)) +
geom_histogram() +
facet_grid(assembler ~ .) +
xlim(0, 100) +
guides(fill = FALSE) +
xlab('Reference coverage (%)') +
ylab('Number of contigs')
# 10. We want to know how 'complete' each assembly is.
# The latest human reference transcriptome has 198,634 transcripts.
# Plot the proportion of reference transcripts that were
# found in each assembly.
human_transcripts <- 198634
reference_represented <- data.frame(
assembler = c('oases', 'soap'),
proportion = c(hits_each$oases / human_transcripts,
hits_each$soap / human_transcripts)
)
ggplot(reference_represented, aes(x = assembler, y = proportion * 100)) +
geom_bar(stat = 'identity') +
ylab('Percent of reference transcripts assembled') +
xlab('Assembler')
# 11. Just because a transcript was found, does not mean it was
# assembled well! Multiply the proportions from the previous plot
# by the mean reference coverage, and show these in a new plot.
reference_represented$score <-
unlist(reference_represented$proportion * average_coverage[1, 2:3])
ggplot(reference_represented, aes(x = assembler, y = score * 100)) +
geom_bar(stat = 'identity') +
ylab('Overall score (%)') +
xlab('Assembler')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment