blahah · August 20, 2014 14:58
diff --git a/plots.Rmd b/plots.Rmd
 Transrate assembly and contig plots.
 ========================================================

 Assemblies of mouse with 10, 20, 50 and 100 million reads.

 # Assembly-level stats

 ## Load data

 ```{r}
 setwd('~/Dropbox/ongoing_projects/feeding_transcriptome/final')
 astats <- read.csv('all_10M_assemblies.csv')
 astats$num_pairs <- 10e6
 astats$percent_mapping <- with(astats, total_mappings / num_pairs * 100)
 astats$pc_good_mapping <- with(astats, good_mappings / num_pairs * 100)
 astats$assembly <- gsub(astats$assembly, pattern="\\.corr\\.Trinity\\.fasta", replacement="")
 library(reshape2)
 astats <- melt(astats, id='assembly')
 astats <- astats[complete.cases(astats),]
 astats$assembly <- factor(x=astats$assembly, levels=c("10M", "20M", "50M", "100M"), ordered=T)
 ```

 ## Plot the metrics

 ```{r fig.width=7, fig.height=40}
 library(ggplot2)
 ggplot(astats, aes(x=assembly, y=value, fill=assembly)) + 
  geom_bar(position='dodge', stat='identity') +
  facet_grid(variable ~ ., scales="free") +
  theme(strip.text.y = element_text(angle=0))
 ```

 # Contig-level stats

 ## Load data

 ```{r}
 library(reshape2)
 setwd('~/Dropbox/ongoing_projects/feeding_transcriptome/final')
 cstats <- read.csv('all_10M_10M.corr.Trinity.fasta_contigs.csv')
 cstats$millionreads <- 10
 cstats20 <- read.csv('all_10M_20M.corr.Trinity.fasta_contigs.csv')
 cstats20$millionreads <- 20
 cstats <- rbind(cstats, cstats20)
 cstats50 <- read.csv('all_10M_50M.corr.Trinity.fasta_contigs.csv')
 cstats50$millionreads <- 50
 cstats <- rbind(cstats, cstats50)
 cstats100 <- read.csv('all_10M_100M.corr.Trinity.fasta_contigs.csv')
 cstats100$millionreads <- 100
 cstats <- rbind(cstats, cstats100)
 ```

 Plot the metrics

 ## Contig stats

 ### Length

 ```{r fig.width=12, fig.height=8}
 library(ggplot2)
 ggplot(cstats, aes(x=length, colour=factor(millionreads))) + 
  geom_density() +
  labs(colour='Reads (x10^6)')
 ```

 ### GC

 ```{r fig.width=12, fig.height=8}
 ggplot(cstats, aes(x=prop_gc, colour=factor(millionreads))) + 
  geom_density() +
  labs(colour='Reads (x10^6)')
 ```

 ### GC skew

 ```{r fig.width=12, fig.height=8}
 ggplot(cstats, aes(x=gc_skew, colour=factor(millionreads))) + 
  geom_density() +
  labs(colour='Reads (x10^6)')
 ```

 ### AT skew

 ```{r fig.width=12, fig.height=8}
 ggplot(cstats, aes(x=at_skew, colour=factor(millionreads))) + 
  geom_density() +
  labs(colour='Reads (x10^6)')
 ```

 ### CpG count

 ```{r fig.width=12, fig.height=8}
 ggplot(cstats, aes(x=cpg_count, colour=factor(millionreads))) + 
  geom_density() +
  labs(colour='Reads (x10^6)')
 ```

 ### Cpg ratio

 ```{r fig.width=12, fig.height=8}
 ggplot(cstats, aes(x=cpg_ratio, colour=factor(millionreads))) + 
  geom_density() +
  labs(colour='Reads (x10^6)')
 ```

 ### Orf length

 ```{r fig.width=12, fig.height=8}
 ggplot(cstats, aes(x=orf_length, colour=factor(millionreads))) + 
  geom_density() +
  labs(colour='Reads (x10^6)') + scale_x_log10()
 ```

 ### Linguistic complexity (k=6)

 ```{r fig.width=12, fig.height=8}
 ggplot(cstats, aes(x=linguistic_complexity_6, colour=factor(millionreads))) + 
  geom_density() +
  labs(colour='Reads (x10^6)')
 ```

 ## Read metrics

 ### Uncovered bases

 ```{r fig.width=12, fig.height=8}
 ggplot(cstats, aes(x=uncovered_bases, colour=factor(millionreads))) + 
  geom_density() +
  labs(colour='Reads (x10^6)') + scale_x_log10()
 ```

 ### Mean coverage

 ```{r fig.width=12, fig.height=8}
 ggplot(cstats, aes(x=mean_coverage, colour=factor(millionreads))) + 
  geom_density() +
  labs(colour='Reads (x10^6)') + scale_x_log10()
 ggplot(cstats, aes(y=mean_coverage, x=factor(millionreads))) + 
  geom_violin() +
  labs(colour='Reads (x10^6)') + scale_y_log10()
 ```

 ### In bridges

 ```{r fig.width=12, fig.height=8}
 ggplot(cstats, aes(x=in_bridges, colour=factor(millionreads))) + 
  geom_density() +
  labs(colour='Reads (x10^6)') + scale_x_log10()
 ```

 ### In bridges

 ```{r fig.width=12, fig.height=8}
 ggplot(cstats, aes(x=in_bridges, colour=factor(millionreads))) + 
  geom_density() +
  labs(colour='Reads (x10^6)') + scale_x_log10()
 ```

 ### Edit distance per base

 ```{r fig.width=12, fig.height=8}
 ggplot(cstats, aes(x=edit_distance_per_base, colour=factor(millionreads))) + 
  geom_density() +
  labs(colour='Reads (x10^6)')
 ```

 ### Low uniqueness bases

 ```{r fig.width=12, fig.height=8}
 ggplot(cstats, aes(x=low_uniqueness_bases, colour=factor(millionreads))) + 
  geom_density() +
  labs(colour='Reads (x10^6)') + scale_x_log10()
 ```

 ### Low uniqueness bases (proportion)

 ```{r fig.width=12, fig.height=8}
 ggplot(cstats, aes(x=p_low_uniqueness_bases, colour=factor(millionreads))) + 
  geom_density() +
  labs(colour='Reads (x10^6)')
 ```


 ## Reference-based metrics

 ### Reference coverage

 ```{r fig.width=12, fig.height=8}
 ggplot(cstats, aes(x=reference_coverage, colour=factor(millionreads))) + 
  geom_density() +
  labs(colour='Reads (x10^6)')
 ```
	Transrate assembly and contig plots.
	========================================================

	Assemblies of mouse with 10, 20, 50 and 100 million reads.

	# Assembly-level stats

	## Load data

	```{r}
	setwd('~/Dropbox/ongoing_projects/feeding_transcriptome/final')
	astats <- read.csv('all_10M_assemblies.csv')
	astats$num_pairs <- 10e6
	astats$percent_mapping <- with(astats, total_mappings / num_pairs * 100)
	astats$pc_good_mapping <- with(astats, good_mappings / num_pairs * 100)
	astats$assembly <- gsub(astats$assembly, pattern="\\.corr\\.Trinity\\.fasta", replacement="")
	library(reshape2)
	astats <- melt(astats, id='assembly')
	astats <- astats[complete.cases(astats),]
	astats$assembly <- factor(x=astats$assembly, levels=c("10M", "20M", "50M", "100M"), ordered=T)
	```

	## Plot the metrics

	```{r fig.width=7, fig.height=40}
	library(ggplot2)
	ggplot(astats, aes(x=assembly, y=value, fill=assembly)) +
	geom_bar(position='dodge', stat='identity') +
	facet_grid(variable ~ ., scales="free") +
	theme(strip.text.y = element_text(angle=0))
	```

	# Contig-level stats

	## Load data

	```{r}
	library(reshape2)
	setwd('~/Dropbox/ongoing_projects/feeding_transcriptome/final')
	cstats <- read.csv('all_10M_10M.corr.Trinity.fasta_contigs.csv')
	cstats$millionreads <- 10
	cstats20 <- read.csv('all_10M_20M.corr.Trinity.fasta_contigs.csv')
	cstats20$millionreads <- 20
	cstats <- rbind(cstats, cstats20)
	cstats50 <- read.csv('all_10M_50M.corr.Trinity.fasta_contigs.csv')
	cstats50$millionreads <- 50
	cstats <- rbind(cstats, cstats50)
	cstats100 <- read.csv('all_10M_100M.corr.Trinity.fasta_contigs.csv')
	cstats100$millionreads <- 100
	cstats <- rbind(cstats, cstats100)
	```

	Plot the metrics

	## Contig stats

	### Length

	```{r fig.width=12, fig.height=8}
	library(ggplot2)
	ggplot(cstats, aes(x=length, colour=factor(millionreads))) +
	geom_density() +
	labs(colour='Reads (x10^6)')
	```

	### GC

	```{r fig.width=12, fig.height=8}
	ggplot(cstats, aes(x=prop_gc, colour=factor(millionreads))) +
	geom_density() +
	labs(colour='Reads (x10^6)')
	```

	### GC skew

	```{r fig.width=12, fig.height=8}
	ggplot(cstats, aes(x=gc_skew, colour=factor(millionreads))) +
	geom_density() +
	labs(colour='Reads (x10^6)')
	```

	### AT skew

	```{r fig.width=12, fig.height=8}
	ggplot(cstats, aes(x=at_skew, colour=factor(millionreads))) +
	geom_density() +
	labs(colour='Reads (x10^6)')
	```

	### CpG count

	```{r fig.width=12, fig.height=8}
	ggplot(cstats, aes(x=cpg_count, colour=factor(millionreads))) +
	geom_density() +
	labs(colour='Reads (x10^6)')
	```

	### Cpg ratio

	```{r fig.width=12, fig.height=8}
	ggplot(cstats, aes(x=cpg_ratio, colour=factor(millionreads))) +
	geom_density() +
	labs(colour='Reads (x10^6)')
	```

	### Orf length

	```{r fig.width=12, fig.height=8}
	ggplot(cstats, aes(x=orf_length, colour=factor(millionreads))) +
	geom_density() +
	labs(colour='Reads (x10^6)') + scale_x_log10()
	```

	### Linguistic complexity (k=6)

	```{r fig.width=12, fig.height=8}
	ggplot(cstats, aes(x=linguistic_complexity_6, colour=factor(millionreads))) +
	geom_density() +
	labs(colour='Reads (x10^6)')
	```

	## Read metrics

	### Uncovered bases

	```{r fig.width=12, fig.height=8}
	ggplot(cstats, aes(x=uncovered_bases, colour=factor(millionreads))) +
	geom_density() +
	labs(colour='Reads (x10^6)') + scale_x_log10()
	```

	### Mean coverage

	```{r fig.width=12, fig.height=8}
	ggplot(cstats, aes(x=mean_coverage, colour=factor(millionreads))) +
	geom_density() +
	labs(colour='Reads (x10^6)') + scale_x_log10()
	ggplot(cstats, aes(y=mean_coverage, x=factor(millionreads))) +
	geom_violin() +
	labs(colour='Reads (x10^6)') + scale_y_log10()
	```

	### In bridges

	```{r fig.width=12, fig.height=8}
	ggplot(cstats, aes(x=in_bridges, colour=factor(millionreads))) +
	geom_density() +
	labs(colour='Reads (x10^6)') + scale_x_log10()
	```

	### In bridges

	```{r fig.width=12, fig.height=8}
	ggplot(cstats, aes(x=in_bridges, colour=factor(millionreads))) +
	geom_density() +
	labs(colour='Reads (x10^6)') + scale_x_log10()
	```

	### Edit distance per base

	```{r fig.width=12, fig.height=8}
	ggplot(cstats, aes(x=edit_distance_per_base, colour=factor(millionreads))) +
	geom_density() +
	labs(colour='Reads (x10^6)')
	```

	### Low uniqueness bases

	```{r fig.width=12, fig.height=8}
	ggplot(cstats, aes(x=low_uniqueness_bases, colour=factor(millionreads))) +
	geom_density() +
	labs(colour='Reads (x10^6)') + scale_x_log10()
	```

	### Low uniqueness bases (proportion)

	```{r fig.width=12, fig.height=8}
	ggplot(cstats, aes(x=p_low_uniqueness_bases, colour=factor(millionreads))) +
	geom_density() +
	labs(colour='Reads (x10^6)')
	```


	## Reference-based metrics

	### Reference coverage

	```{r fig.width=12, fig.height=8}
	ggplot(cstats, aes(x=reference_coverage, colour=factor(millionreads))) +
	geom_density() +
	labs(colour='Reads (x10^6)')
	```