blahah · December 4, 2015 15:40
diff --git a/summarise_assemblies.R b/summarise_assemblies.R
 setwd("~/project_icipe")
 library(ggplot2)

 # 1. load the files
 oases <- read.csv('./oases.csv')
 soap <- read.csv('./soapdenovotrans.csv')

 # check the number of rows and columns
 dim(oases)
 dim(soap)

 # add an 'assembler' column
 oases$assembler <- 'oases'
 soap$assembler <- 'soap'

 # create a merged data frame
 assemblies <- rbind(oases, soap)

 # 2. plot number of contigs in each assembly
 library(ggplot2)
 ggplot(assemblies, aes(x = assembler)) +
  geom_bar() +
  xlab('Assembler') +
  ylab('Number of contigs') +
  theme_bw()

 # 3. add to the previous plot the proportion of each assembly that had a reference hit
 ggplot(assemblies, aes(x = assembler, fill = has_crb)) +
  geom_bar() +
  xlab('Assembler') +
  ylab('Number of contigs') +
  theme_bw()

 # 4. if a contig matches a reference transcript, the name of the reference
 # transcript is in the 'hits' column. How many reference transcripts in total
 # were hit across both assemblies?
 total_hits <- length(levels(assemblies[!is.na(assemblies$hits), 'hits']))
 print(total_hits)

 # 5. how many reference transcripts were represented in each assembly?
 hits_each <- data.frame(
  oases = length(levels(oases[!is.na(oases$hits), 'hits'])),
  soap = length(levels(soap[!is.na(soap$hits), 'hits']))
 )
 print(hits_each)

 # 6. how many of the reference transcripts were only found in that assembly?
 oases_hits <- levels(oases[!is.na(oases$hits), 'hits'])
 soap_hits <- levels(soap[!is.na(soap$hits), 'hits'])
 unique_hits <- data.frame(
  oases = length(setdiff(oases_hits, soap_hits)),
  soap = length(setdiff(soap_hits, oases_hits))
 )
 print(unique_hits)

 # 7. The reference coverage column is the proportion of the reference
 # transcript that was reconstructed by this contig. What is the average
 # (mean and median) for each assembly?
 oases_coverage <- oases[!is.na(oases$reference_coverage), 'reference_coverage']
 soap_coverage <- soap[!is.na(soap$reference_coverage), 'reference_coverage']
 average_coverage <- data.frame(
  calculation = c('mean', 'median'),
  oases = c(mean(oases_coverage), median(oases_coverage)),
  soap = c(mean(soap_coverage), median(soap_coverage))
 )
 print(average_coverage)

 # 8. Make a bar plot showing how many contigs in each assembly
 # had a reference coverage of at least 0.75
 ggplot(assemblies['reference_coverage' >= 0.75, ],
       aes(x = assembler)) +
  geom_bar() +
  xlab('Assembler') +
  ylab('Contigs with reference coverage >= 75%')

 # plot contig score distribution for each assembly
 ggplot(assemblies, aes(x = reference_coverage * 100, fill = assembler)) +
  geom_histogram() +
  facet_grid(assembler ~ .) +
  xlim(0, 100) +
  guides(fill = FALSE) +
  xlab('Reference coverage (%)') +
  ylab('Number of contigs')

 # 10. We want to know how 'complete' each assembly is.
 # The latest human reference transcriptome has 198,634 transcripts.
 # Plot the proportion of reference transcripts that were
 # found in each assembly.
 human_transcripts <- 198634
 reference_represented <- data.frame(
  assembler = c('oases', 'soap'),
  proportion = c(hits_each$oases / human_transcripts,
                 hits_each$soap / human_transcripts)
 )
 ggplot(reference_represented, aes(x = assembler, y = proportion * 100)) +
  geom_bar(stat = 'identity') +
  ylab('Percent of reference transcripts assembled') +
  xlab('Assembler')

 # 11. Just because a transcript was found, does not mean it was
 # assembled well! Multiply the proportions from the previous plot
 # by the mean reference coverage, and show these in a new plot.
 reference_represented$score <-
  unlist(reference_represented$proportion * average_coverage[1, 2:3])

 ggplot(reference_represented, aes(x = assembler, y = score * 100)) +
  geom_bar(stat = 'identity') +
  ylab('Overall score (%)') +
  xlab('Assembler')
	setwd("~/project_icipe")
	library(ggplot2)

	# 1. load the files
	oases <- read.csv('./oases.csv')
	soap <- read.csv('./soapdenovotrans.csv')

	# check the number of rows and columns
	dim(oases)
	dim(soap)

	# add an 'assembler' column
	oases$assembler <- 'oases'
	soap$assembler <- 'soap'

	# create a merged data frame
	assemblies <- rbind(oases, soap)

	# 2. plot number of contigs in each assembly
	library(ggplot2)
	ggplot(assemblies, aes(x = assembler)) +
	geom_bar() +
	xlab('Assembler') +
	ylab('Number of contigs') +
	theme_bw()

	# 3. add to the previous plot the proportion of each assembly that had a reference hit
	ggplot(assemblies, aes(x = assembler, fill = has_crb)) +
	geom_bar() +
	xlab('Assembler') +
	ylab('Number of contigs') +
	theme_bw()

	# 4. if a contig matches a reference transcript, the name of the reference
	# transcript is in the 'hits' column. How many reference transcripts in total
	# were hit across both assemblies?
	total_hits <- length(levels(assemblies[!is.na(assemblies$hits), 'hits']))
	print(total_hits)

	# 5. how many reference transcripts were represented in each assembly?
	hits_each <- data.frame(
	oases = length(levels(oases[!is.na(oases$hits), 'hits'])),
	soap = length(levels(soap[!is.na(soap$hits), 'hits']))
	)
	print(hits_each)

	# 6. how many of the reference transcripts were only found in that assembly?
	oases_hits <- levels(oases[!is.na(oases$hits), 'hits'])
	soap_hits <- levels(soap[!is.na(soap$hits), 'hits'])
	unique_hits <- data.frame(
	oases = length(setdiff(oases_hits, soap_hits)),
	soap = length(setdiff(soap_hits, oases_hits))
	)
	print(unique_hits)

	# 7. The reference coverage column is the proportion of the reference
	# transcript that was reconstructed by this contig. What is the average
	# (mean and median) for each assembly?
	oases_coverage <- oases[!is.na(oases$reference_coverage), 'reference_coverage']
	soap_coverage <- soap[!is.na(soap$reference_coverage), 'reference_coverage']
	average_coverage <- data.frame(
	calculation = c('mean', 'median'),
	oases = c(mean(oases_coverage), median(oases_coverage)),
	soap = c(mean(soap_coverage), median(soap_coverage))
	)
	print(average_coverage)

	# 8. Make a bar plot showing how many contigs in each assembly
	# had a reference coverage of at least 0.75
	ggplot(assemblies['reference_coverage' >= 0.75, ],
	aes(x = assembler)) +
	geom_bar() +
	xlab('Assembler') +
	ylab('Contigs with reference coverage >= 75%')

	# plot contig score distribution for each assembly
	ggplot(assemblies, aes(x = reference_coverage * 100, fill = assembler)) +
	geom_histogram() +
	facet_grid(assembler ~ .) +
	xlim(0, 100) +
	guides(fill = FALSE) +
	xlab('Reference coverage (%)') +
	ylab('Number of contigs')

	# 10. We want to know how 'complete' each assembly is.
	# The latest human reference transcriptome has 198,634 transcripts.
	# Plot the proportion of reference transcripts that were
	# found in each assembly.
	human_transcripts <- 198634
	reference_represented <- data.frame(
	assembler = c('oases', 'soap'),
	proportion = c(hits_each$oases / human_transcripts,
	hits_each$soap / human_transcripts)
	)
	ggplot(reference_represented, aes(x = assembler, y = proportion * 100)) +
	geom_bar(stat = 'identity') +
	ylab('Percent of reference transcripts assembled') +
	xlab('Assembler')

	# 11. Just because a transcript was found, does not mean it was
	# assembled well! Multiply the proportions from the previous plot
	# by the mean reference coverage, and show these in a new plot.
	reference_represented$score <-
	unlist(reference_represented$proportion * average_coverage[1, 2:3])

	ggplot(reference_represented, aes(x = assembler, y = score * 100)) +
	geom_bar(stat = 'identity') +
	ylab('Overall score (%)') +
	xlab('Assembler')