Created
February 24, 2016 13:41
-
-
Save ohofmann/3e32236166f26f638841 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "QC Report" | |
author: "Automatically generated" | |
date: "tba" | |
output: | |
html_document: | |
toc: true | |
theme: united | |
--- | |
```{r custom} | |
library(ggplot2) | |
library(pheatmap) | |
library(scales) | |
library(gridExtra) | |
library(gtools) | |
library(RColorBrewer) | |
library(knitr) | |
library(tidyr) | |
library(dplyr) | |
library(reshape) | |
library(rmarkdown) | |
library(dplyr) | |
library(DT) | |
number_ticks <- function(n) {function(limits) pretty(limits, n)} | |
options(bitmapType = 'cairo') | |
path_results = getwd() | |
``` | |
```{r create-report, echo=FALSE, eval=FALSE} | |
knitr::opts_chunk$set(tidy=TRUE, highlight=TRUE, dev="png", | |
cache=FALSE, highlight=TRUE, autodep=TRUE, warning=FALSE, | |
error=FALSE, eval=TRUE, fig.width= 9, echo=FALSE, | |
message=FALSE, prompt=TRUE, comment='', fig.cap='', | |
bootstrap.show.code=FALSE) | |
render(file.path(path_results, "report-briTest.Rmd")) | |
``` | |
## Basic sample metrics | |
A quick summary of sample metrics to identify outliers. Offtargets will be set to 0 for non-targeted experiments, and table cells are color-coded to indivate deviation from the mean for a given metric. Note that this may be expected depending on the experimental setup. | |
```{r table, results='asis'} | |
qc = read.table(file.path(path_results, "metrics", "metrics.tsv"), | |
header=T, sep="\t", check.names=F, | |
colClasses=list("sample"="character")) | |
rownames(qc) = qc$sample | |
# Define metrics to display | |
metrics = c("sample", "Total_reads" ,"Mapped_reads_pct", "Duplicates_pct", | |
"offtarget", "%GC", "Sequence_length", "Median_insert_size") | |
if (is.null(qc$offtarget)) { | |
qc$offtarget <- rep (0, length(qc$sample)) | |
} else { | |
qc$offtarget = qc$offtarget/qc$Total_reads | |
} | |
# Adjust some of the text information for formatting purposes | |
qc_table <- qc[, metrics] | |
qc_table$Mapped_reads_pct = as.numeric(gsub("%", "", qc_table$Mapped_reads_pct)) | |
qc_table$Mapped_reads_pct <- qc_table$Mapped_reads_pct / 100 | |
qc_table$Duplicates_pct = as.numeric(gsub("%", "", qc_table$Duplicates_pct)) | |
qc_table$Duplicates_pct <- qc_table$Duplicates_pct / 100 | |
# Calculate mean and SD where appropriate | |
datatable(qc_table, | |
rownames=FALSE, | |
options=list(dom = 'tp', | |
pageLength = length(qc_table$sample))) %>% | |
formatPercentage('Mapped_reads_pct', 1) %>% | |
formatPercentage('Duplicates_pct', 1) %>% | |
formatPercentage('offtarget', 1) %>% | |
formatStyle('Total_reads', | |
backgroundColor = styleInterval(c(mean(qc_table$Total_reads) - | |
2 * sd(qc_table$Total_reads), | |
mean(qc_table$Total_reads) - | |
sd(qc_table$Total_reads), | |
mean(qc_table$Total_reads)), | |
c('#f03b20', | |
'#feb24c', | |
'#ffeda0', | |
'white'))) %>% | |
formatStyle('Mapped_reads_pct', | |
backgroundColor = styleInterval(c(mean(qc_table$Mapped_reads_pct) - | |
2 * sd(qc_table$Mapped_reads_pct), | |
mean(qc_table$Mapped_reads_pct) - | |
sd(qc_table$Mapped_reads_pct), | |
mean(qc_table$Mapped_reads_pct)), | |
c('#f03b20', | |
'#feb24c', | |
'#ffeda0', | |
'white'))) %>% | |
formatStyle('Duplicates_pct', | |
backgroundColor = styleInterval(c(mean(qc_table$Duplicates_pct), | |
mean(qc_table$Duplicates_pct) + | |
sd(qc_table$Duplicates_pct), | |
mean(qc_table$Duplicates_pct) + | |
2 * sd(qc_table$Duplicates_pct)), | |
c('white', | |
'#ffeda0', | |
'#feb24c', | |
'#f03b20' | |
))) %>% | |
formatStyle('%GC', | |
backgroundColor = styleInterval(c(mean(qc_table$"%GC") - | |
2 * sd(qc_table$"%GC"), | |
mean(qc_table$"%GC") - | |
sd(qc_table$"%GC"), | |
mean(qc_table$"%GC") + | |
sd(qc_table$"%GC"), | |
mean(qc_table$"%GC") + | |
2 * sd(qc_table$"%GC")), | |
c('#feb24c', | |
'#ffeda0', | |
'white', | |
'#ffeda0', | |
'#feb24c' | |
))) %>% | |
formatStyle('offtarget', | |
backgroundColor = styleInterval(c(mean(qc_table$offtarget), | |
mean(qc_table$offtarget) + | |
sd(qc_table$offtarget), | |
mean(qc_table$offtarget) + | |
2 * sd(qc_table$offtarget)), | |
c('white', | |
'#ffeda0', | |
'#feb24c', | |
'#f03b20' | |
))) %>% | |
formatStyle('Median_insert_size', | |
backgroundColor = styleInterval(c(mean(qc_table$Median_insert_size) - | |
2 * sd(qc_table$Median_insert_size), | |
mean(qc_table$Median_insert_size) - | |
sd(qc_table$Median_insert_size), | |
mean(qc_table$Median_insert_size) + | |
sd(qc_table$Median_insert_size), | |
mean(qc_table$Median_insert_size) + | |
2 * sd(qc_table$Median_insert_size)), | |
c('#feb24c', | |
'#ffeda0', | |
'white', | |
'#ffeda0', | |
'#feb24c' | |
))) | |
``` | |
### Total and mapped reads | |
The next two plots compare the number of reads in each sample (should be uniform) and the percentage of reads mapping to the reference genome. Low mapping rates are indicative of sample contamination, poor sequencing quality or other artifacts. | |
```{r total-reads} | |
ggplot(qc, aes(x=sample, y=(Total_reads)/1e6)) + | |
geom_bar(stat = 'identity') + | |
ylab("Million reads") + | |
theme(axis.text.x = element_text(angle = 90, hjust = 1)) | |
``` | |
```{r mapped-reads} | |
ggplot(qc, aes(x=sample, y=Mapped_reads/Total_reads)) + | |
geom_bar(stat = 'identity') + | |
theme(axis.text.x = element_text(angle = 90, hjust = 1)) | |
``` | |
The _effective_ number of reads is the total number of mapped read adjusted for duplicates; a high duplicate rate can reduce the overall coverage that will contribute to variant calling processes: | |
```{r effective-reads} | |
ggplot(qc, aes(x=sample, y=(Total_reads-Duplicates)/1e6)) + | |
geom_bar(stat = 'identity') + | |
ylab("Million reads") + | |
theme(axis.text.x = element_text(angle = 90, hjust = 1)) | |
``` | |
### Offtarget reads | |
In addition to total read count and map-ability it is important to know if the target enrichment (if any) worked, i.e., what percentage of reads are on the amplified or captured regions, and what percentage is considered 'off target'. On target percentages should be uniform and ideally above 50% for most capture methods. | |
```{r off-reads} | |
ggplot(qc, aes(x=sample, y=offtarget_pct*100)) + | |
geom_bar(stat = 'identity') + | |
theme(axis.text.x = element_text(angle = 90, hjust = 1)) | |
``` | |
## Sequencing metrics | |
Various metrics related to the quality of the sequencing data itself, i.e., the FASTQ or BAM fie as it came off the sequencer. | |
### Mean sequence quality along read | |
Mean base quality for each sample along with the upper and lower quartile, and the 10% (lower) quantile. A slight dropoff towards the end of the read is normal, but you want the mean qualities and quartiles to track each other closely with relatively few outliers. In addition, all plots should look similar to each other. | |
```{r read-qual} | |
qual = read.table(file.path(path_results, "fastqc", "Per_base_sequence_quality.tsv"), | |
header=T, sep= "\t", check.names =T, | |
colClasses=list("sample"="character")) | |
qual$sample = as.character(qual$sample) | |
ggplot(qual, aes(x=Base, y=Median, group=sample)) + | |
geom_line() + | |
geom_line(aes(x=Base, y=Lower_Quartile, group=sample, col="lower_quantile")) + | |
geom_line(aes(x=Base, y=Upper_Quartile, group=sample, col="upper_quantile")) + | |
geom_line(aes(x=Base, y=X10th_Percentile, group=sample, col="10%_quantile")) + | |
facet_wrap(~sample) + | |
ylim(0,45) + | |
scale_color_brewer("metrics",palette = "Set1") + | |
theme_bw() + | |
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5))+ | |
scale_x_discrete(breaks=number_ticks(10)) | |
``` | |
### Read length distribution | |
This should ideally be uniform across all samples with one distinct read length dominating the distribution. | |
```{r read-size} | |
qual = read.table(file.path(path_results, "fastqc", "Sequence_Length_Distribution.tsv"), | |
header=T, sep= "\t", check.names = F, | |
colClasses=list("sample"="character")) | |
qual = qual %>% group_by(sample) %>% mutate(total=sum(Count), pct=Count/total) | |
ggplot(qual , aes(x=Length, y=Count, group=sample)) + | |
geom_line(size=2, alpha=0.5) + | |
scale_color_brewer(palette = "Set1") + | |
theme_bw() + | |
facet_wrap(~sample) + | |
labs(y="# of reads")+ | |
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5)) + | |
scale_x_discrete(breaks=number_ticks(5)) | |
``` | |
The following table lists for each sample the read lengths contributing more than 10% of that sample's total number of reads: | |
```{r table-size, results='asis'} | |
kable(qual %>% filter(pct > 0.10) %>% dplyr::select(Length, sample, pct) %>% spread(Length, pct), align="c", digits=2) | |
``` | |
### Read GC content | |
For re-sequencing projects the GC nucleotide content of sequenced reads should follow the genome distribution. Here we are checking if there are any samples where reads outside of a 10-90% GC content contribute more than 5% of the overall reads. These cutoffs can be tweaked as needed based on the G/C distribution graph: | |
```{r table-gc, results='asis'} | |
qual = read.table(file.path(path_results, "fastqc", "Per_sequence_GC_content.tsv"), | |
header=T, sep= "\t", check.names = F, | |
colClasses=list("sample"="character")) | |
qual = qual %>% group_by(sample) %>% mutate(total=sum(Count), pct=Count/total) | |
ggplot(qual, aes(x=GC_Content, y=pct, group=sample)) + | |
geom_line(size=2, alpha=0.5) + | |
scale_color_brewer(palette = "Set1") + | |
theme_bw() + | |
labs(y='Percent of reads', x='Percent G/C content') | |
qual = qual %>% filter(GC_Content>10 & GC_Content<90) %>% | |
group_by(sample) %>% summarise(pct_in_10_90=sum(pct)) | |
kable(qual %>% filter(pct_in_10_90 <= 0.95)) | |
``` | |
### Read nucleotide content | |
Expanding on the GC read content analysis this checks for biases in nucleotide content along the position in the read. Typically, biases are introduced to due preferential (non-random) primer binding (at the beginning of the read) or other artifacts; strong biases are indicative of technical problems: | |
```{r read-content} | |
qual = read.table(file.path(path_results, "fastqc", "Per_base_sequence_content.tsv"), | |
header=T, sep= "\t", check.names = F, | |
colClasses=list("sample"="character")) | |
qual$sample = as.character(qual$sample) | |
qual$Base = as.numeric(qual$Base) | |
dd = melt(qual, id.vars = c("Base", "sample"), variable_name = c("nt")) | |
ggplot(dd %>% filter(nt!="total"), aes(x=Base, y=value, group=sample)) + | |
geom_line(size=2, alpha=0.5) + | |
theme_bw() + | |
ylab("% of nucleotides") + | |
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5)) + | |
ylim(10,50) + | |
facet_wrap(~nt) | |
``` | |
### Read nuceotide biases | |
To help trace down possible adapter contamination, missed barcodes or other biases we list for each sample the read positions where any nucleotide is present in either less than 10% or more than 30% of the reads: | |
```{r table-content, results='asis'} | |
kable( melt(qual, id.vars=c("Base", "sample"), variable_name = "nt") %>% | |
filter(value > 30 | value < 10) %>% filter(Base<20) %>% | |
dplyr::select(Base, sample, nt, value) %>% | |
spread(Base, value), | |
align="c", digits=2) | |
``` | |
## Coverage in regions | |
```{r total-coverage-load} | |
get_quantile_cov = function(path){ | |
cov_tab = data.frame() | |
cov_regions = data.frame() | |
for (fn in list.files(path, pattern = "_coverage_fixed_summary.bed", full.names = TRUE)){ | |
d = read.table(fn, header=T, sep="\t", comment.char = "%") | |
if (nrow(d) >0 ){ | |
d$region_pct = 100-d$region_pct | |
cov_tab = rbind(cov_tab, d) | |
} | |
} | |
ma = matrix() | |
for (fn in list.files(path, pattern = "_coverage_fixed.bed", full.names = TRUE)){ | |
if (!grepl("priority", fn)){ | |
d = read.table(fn, header=T) | |
if (nrow(d) >0 ){ | |
if (nrow(ma)>1){ | |
ma = ma + (d[,c("percentage10", "percentage20", "percentage50")] > 60) * 1 | |
}else{ | |
ma = as.matrix(d[,c("percentage10", "percentage20", "percentage50")] > 60) * 1 | |
} | |
} | |
} | |
} | |
cov_regions = cbind(d[,1:5], ma) | |
list("sample" = cov_tab, "region" = as.data.frame(cov_regions)) | |
} | |
get_total_cov = function(path){ | |
cov_tab = data.frame() | |
for (fn in list.files(path, pattern = "_total_summary.bed", full.names = TRUE)){ | |
d = read.table(fn, header=T, sep="\t") | |
if (nrow(d) >0 ){ | |
d[,1] = as.numeric(as.character(gsub("percentage", "", d$cutoff_reads))) | |
d = d[order(d[,1]),] | |
pct = d[,2] | |
t = data.frame(depth=d[,1], bases=pct, sample=d$sample[1]) | |
cov_tab = rbind(cov_tab, t) | |
} | |
} | |
cov_tab | |
} | |
make_total_cov_plots = function(cov_tab){ | |
if (nrow(cov_tab) >0 ){ | |
p =ggplot(cov_tab, aes(y=bases, x=depth, group=sample)) + | |
geom_line(size=2, alpha=.5)+ | |
theme_bw()+ | |
labs(list(y="% of bed file > depth", x="# of reads")) | |
print(p) | |
} | |
} | |
make_quantile_plots = function(cov_tab){ | |
if (nrow(cov_tab) >0 ){ | |
p1 = ggplot(cov_tab %>% filter(cutoff_reads=='percentage10'), aes(x=region_pct, y=bases_pct, group=sample)) + | |
geom_line(size=2, alpha=.5)+ | |
theme_bw()+ | |
labs(list(x="% of target regions with\nmore than X bases covered", y="% of nt covered\ninside the target", title="considered covered when nt has >10 reads")) | |
p2 = ggplot(cov_tab %>% filter(cutoff_reads=='percentage20'), aes(x=region_pct, y=bases_pct, group=sample)) + | |
geom_line(size=2, alpha=.5)+ | |
theme_bw()+ | |
labs(list(x="% of target regions with\nmore than X bases covered", y="% of nt covered\ninside the target", title="considered covered when nt has >20 reads")) | |
p3 = ggplot(cov_tab %>% filter(cutoff_reads=='percentage50'), aes(x=region_pct, y=bases_pct, group=sample)) + | |
geom_line(size=2, alpha=.5)+ | |
theme_bw()+ | |
labs(list(x="% of target regions with\nmore than X bases covered", y="% of nt covered\ninside the target", title="considered covered when nt has >50 reads")) | |
grid.arrange(p1, p2, p3, ncol=1) | |
} | |
} | |
make_quantile_region_plots = function(dd, n_samples){ | |
# cov_tab = melt(cov_tab, id.vars="region") | |
p1 = ggplot(dd %>% group_by(percentage10) %>% summarise(n_regions = n()) %>% filter(percentage10<n_samples*0.8), aes(x=n_regions, y=percentage10)) + | |
geom_point(size=2, alpha=.5)+ | |
theme_bw()+ | |
labs(list(x="number of regions", y="num of samples with region covered", title="considered covered when nt has >10 reads. Only looking at regions with < 80% of samples")) | |
p2 = ggplot(dd %>% group_by(percentage20) %>% summarise(n_regions = n()) %>% filter(percentage20<n_samples*0.8), aes(x=n_regions, y=percentage20)) + | |
geom_point(size=2, alpha=.5)+ | |
theme_bw()+ | |
labs(list(x="number of regions", y="num of samples with region covered", title="considered covered when nt has >10 reads. Only looking at regions with < 80% of samples")) | |
p3 = ggplot(dd %>% group_by(percentage50) %>% summarise(n_regions = n()) %>% filter(percentage50<n_samples*0.8), aes(x=n_regions, y=percentage50)) + | |
geom_point(size=2, alpha=.5)+ | |
theme_bw()+ | |
labs(list(x="number of regions", y="num of samples with region covered", title="considered covered when nt has >10 reads. Only looking at regions with < 80% of samples")) | |
grid.arrange(p1, p2, p3, ncol=1) | |
} | |
``` | |
### Coverage distribution (total) by sample | |
```{r cov-total-fig, fig.height=6, fig.width=11, cache=TRUE} | |
n_samples = nrow(qc) | |
cov_tab_total = get_total_cov(file.path(path_results, "coverage")) | |
make_total_cov_plots(cov_tab_total) | |
``` | |
### Coverage distribution (completeness) by sample | |
```{r completeness-fig, fig.height=12, fig.width=12, cache=TRUE} | |
cov_tab = get_quantile_cov(file.path(path_results, "coverage")) | |
make_quantile_plots(cov_tab$sample) | |
``` | |
Samples where more than 90% of targets have less than 60% covered at completeness cut-off of 10, 20 or 40x: | |
```{r table-completeness, results='asis'} | |
kable(cov_tab$sample %>% filter(region_pct >= 90) %>% | |
spread(cutoff_reads, bases_pct) %>% | |
dplyr::select(region_pct, sample, percentage10, percentage20, percentage40) %>% | |
dplyr::filter(percentage10<=60), | |
align="c", digits=2) | |
``` | |
```{r completeness-region-fig, fig.height=12, fig.width=12, cache=TRUE, eval=FALSE} | |
### Coverage distribution (completeness) by region | |
make_quantile_region_plots(cov_tab$region, n_samples) | |
``` | |
```{r table-completeness-regions, results='asis',eval=FALSE} | |
# Regions where less than 60% of samples covered at completeness cut-off of 10. | |
n_samples = nrow(qc) | |
kable(head(cov_tab$region %>% filter(percentage10 < n_samples*0.6),50), | |
align="c", digits=2) | |
write.table(cov_tab$region, file.path(path_results, "completeness_by_region_and_sample.tsv")) | |
``` | |
```{r cov-uniformity-load, echo=FALSE, eval=FALSE} | |
### Coverage uniformity | |
cov_tab = data.frame() | |
get_bias_cov = function(path){ | |
for (fn in list.files(path, pattern = "_cov.tsv", full.names = TRUE)){ | |
d = read.table(fn, header=T, sep="\t") | |
cv = d[,"std"]/d[,"mean"] | |
bias = (d[,"ntdow"] + d[,"ntup"])/d[,"size"] * 100 | |
s = as.character(gsub("-ready","",d[1,"sample"])) | |
t = data.frame(bias=bias, cv=cv, mean=d[,"mean"], sample=s) | |
cov_tab = rbind(cov_tab, t) | |
} | |
cov_tab | |
} | |
make_bias_plot = function(cov_tab){ | |
p1 = ggplot(cov_tab, aes(x=log2(mean), y=cv)) + | |
geom_point(alpha=0.5) + | |
scale_color_brewer(palette = "Set1") + | |
labs(list(y="coefficient of variation",x="log2(mean coverage)")) + | |
theme_bw() + | |
ggtitle("coverage variation for each target region") | |
p2 = ggplot(cov_tab, aes( x=sample,y=bias)) + | |
geom_jitter(alpha=0.5) + | |
scale_color_brewer(palette = "Set1") + | |
theme_bw() + | |
theme(axis.text.x = element_text(angle = 90, hjust = 1)) + | |
labs(list(y="% of nt with mean-2SD > coverage > mean+2SD ")) + | |
ggtitle("% of nucleotides with extreme\ncoverage inside target regions") | |
# grid.arrange(p1, p2, ncol=1) | |
print(p2) | |
} | |
``` | |
```{r cov-uniformity-fig, fig.height=12, fig.width=12, cache=TRUE, eval=FALSE, echo=FALSE} | |
bias_tab = get_bias_cov(file.path(path_results,"bias")) | |
make_bias_plot(bias_tab) | |
``` | |
### Coverage breakdown by target | |
For troubleshooting purposes generating a breakdown by region: what percentage of each targeted region is covered at >20X in a given sample? | |
```{r cov-targets} | |
# Import/concat BED coverage files | |
file_list <- list.files(path='coverage/', pattern='*_coverage_fixed.bed') | |
for (file in file_list){ | |
# if the merged dataset does exist, append to it | |
if (exists("dataset")){ | |
temp_dataset <-read.table(file.path('coverage', file), | |
header=FALSE, sep="\t", stringsAsFactors=FALSE, | |
comment.char='#', skip=1) | |
dataset<-rbind(dataset, temp_dataset) | |
rm(temp_dataset) | |
} | |
# if the merged dataset doesn't exist, create it | |
if (!exists("dataset")){ | |
dataset <- read.table(file.path('coverage', file), header=FALSE, | |
sep="\t", stringsAsFactors=FALSE, | |
comment.char='#', skip=1) | |
} | |
} | |
colnames(dataset) <- c('chrom', 'chromStart', 'chromEnd', | |
'name', 'readCount', 'meanCoverage', | |
'percentage1', 'percentage5', 'percentage10', | |
'percentage20', 'percentage40', 'percentage50', | |
'percentage60', 'percentage70', 'percentage80', | |
'percentage100', 'sampleName') | |
# Replace the name with something more readable | |
dataset$gene <- sapply(strsplit(dataset$name, ","), "[[", 1) | |
# Name of regions are not unique. Come up with new ID | |
dataset$region <- paste(dataset$chrom, | |
dataset$chromStart, | |
dataset$chromEnd, | |
dataset$gene, | |
sep='.') | |
# Re-organise into a matrix format, keeping only the | |
# 20x cutoff | |
cutoff <- dataset %>% select(region, sampleName, percentage20) %>% | |
spread(sampleName, percentage20) | |
rowlabels <- cutoff$region | |
# Remove region information and cast to numeric | |
cutoff <- cutoff[, c(2:length(colnames(cutoff)))] | |
cutoff <- as.data.frame(lapply(cutoff, as.numeric)) | |
rownames(cutoff) <- rowlabels | |
cutoff$Mean <- rowMeans(cutoff) | |
#write.csv(cutoff, file='mean50.csv') | |
datatable(cutoff, | |
rownames=TRUE) %>% | |
formatRound(c(1:length(colnames(cutoff))), 2) %>% | |
formatStyle(c(1:length(colnames(cutoff))), | |
backgroundColor = styleInterval(c(50, 70, 90), | |
c('#f03b20', | |
'#feb24c', | |
'#ffeda0', | |
'white'))) | |
``` | |
## Variant QC | |
### Coverage in variants | |
Finally, looking at global metrics for variant calls such as the number of heterozygous calls, transition/transversion ratios, and the read coverage at sites that were called as variants. | |
```{r table-variants, results='asis'} | |
if (any(grepl("Variations_heterozygous",colnames(qc)))){ | |
qc$ratio_het_hom = qc$Variations_heterozygous/qc$Variations_homozygous | |
metrics = intersect(c("sample", "Variations_total", "Variations_in_dbSNP_pct", | |
"Variations_heterozygous", "Variations_homozygous", | |
"ratio_het_hom", "Transition/Transversion"), colnames(qc)) | |
# Adjust some of the text information for formatting purposes | |
qc$Variations_in_dbSNP_pct <- as.numeric(gsub("%", "", qc$Variations_in_dbSNP_pct)) | |
qc$Variations_in_dbSNP_pct <- qc$Variations_in_dbSNP_pct / 100 | |
datatable(qc[, metrics], | |
rownames=FALSE, | |
options=list(dom = 'tp', | |
autoWidth=FALSE, | |
columnDefs = list(list(width = '600px', | |
targets = c(1), | |
className = 'dt-right', | |
targets=c(1:7))), | |
pageLength = length(qc$sample))) %>% | |
formatPercentage('Variations_in_dbSNP_pct', 1) %>% | |
formatRound('ratio_het_hom', 2) %>% | |
formatRound('Transition/Transversion', 2) | |
}else{ | |
cat("No such information available.") | |
} | |
``` | |
### Variant coverage | |
Another coverage plot, this time limited to positions identified as variants. Read coverage on the X-axis (limited to 100X), percentage of variants with that coverage on the Y-axis. The red line highlights the X=13 cutoff required for somewhat reliable heterogenous variant identification in germline samples. | |
```{r variants-coverage} | |
fns = list.files(file.path(path_results, "variants"), full.names = TRUE, pattern = "gc-depth-parse.tsv") | |
tab = data.frame() | |
for (fn in fns){ | |
dt = read.table(fn, header=T,sep="\t") | |
dt = dt %>% filter(!grepl("[::.::]",depth)) | |
dt[,2] = as.numeric(as.character(dt[,2])) | |
q = quantile(dt[,2],c(0,.10,.25,.50,.75,.90,1)) | |
labels=factor(rev(names(q)),levels=c("0%","10%","25%","50%","75%","90%","100%")) | |
dt = data.frame(variants_pct=labels, depth=q, sample=dt$sample[1]) | |
tab = rbind(tab, dt) | |
} | |
ggplot(tab, aes(x=depth, y=variants_pct, group=sample)) + | |
geom_line(size=2, alpha=.5) + | |
geom_vline(xintercept=13, color='red') + | |
theme_bw() + | |
xlim(0,100) + | |
labs(list(x="# of reads", y="% variants with more than X reads", title="variants coverage")) | |
``` | |
### Variant G/C bias | |
Checking for G/C bias for called variants. This is again capped at 100X coverage; X-axis is G/C content of a read, Y-axis the number of reads supporting a variant, colour reflects the number of variants at that coverage and G/C window. | |
```{r variants-coverage-gc, fig.width=15, fig.height=15} | |
rf <- colorRampPalette(rev(brewer.pal(11,'Spectral'))) | |
r <- c("white", rf(9)) | |
list_p = list() | |
fns = list.files(file.path(path_results, "variants"), full.names = TRUE, pattern = "gc-depth-parse.tsv") | |
for (fn in fns){ | |
dt = read.table(fn, header=T,sep="\t", stringsAsFactors = FALSE) | |
dt = dt %>% filter(!grepl("[::.::]",depth)) | |
dt[,2] = as.numeric(as.character(dt[,2])) | |
dt[,1] = as.numeric(as.character(dt[,1])) | |
sample = dt$sample[1] | |
p = ggplot(dt, aes(CG, depth)) + | |
stat_bin2d() + | |
ylab("# of reads") + | |
scale_fill_gradientn(colours = r, guide = FALSE) + | |
theme_bw() + | |
ylim(0, quantile(dt[,2],.96)) + | |
scale_x_continuous(breaks=seq(0,100,10)) + | |
ggtitle(sample) | |
list_p[[as.character(sample)]]=p | |
} | |
do.call(grid.arrange, list_p) | |
``` | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment