ohofmann · December 7, 2017 02:18
diff --git a/sig.Rmd b/sig.Rmd
 ---
 title: "UMCCR Patient Summary"
 author: "Oliver Hofmann"
 date: "`r Sys.Date()`"
 output:
  html_document:
    theme: readable
    toc: false
    toc_float: false
    code_folding: hide
  rmdformats::material:
    highlight: kate    
 ---

 ```{r custom, echo=FALSE, message=FALSE, warning=FALSE}
 library(ggplot2)
 library(knitr)
 library(rmarkdown)
 library(dplyr)
 library(DT)
 library(MutationalPatterns)
 library(BSgenome)
 library(here)
 ```

 ```{r setup}
 # Set the patient identifier / name of the folder and other variables. These are equivalent
 # to the variables used in the postprocessing script. Could be passed as arguments eventually
 PATIENT <- 'WPT-013'
 TUMOR <- 'WPT-013-organoid'
 BATCH <- 'batch1'

 # Set basepath depending on patient data we want to process. Assumes standard
 # `analysis` folder structure
 patientpath <- here(PATIENT)
 ```

 ## Allelic frequencies

 Comparing the allelic frequencies of all somatic mutations vs those present in a subset of ~300 known cancer genes. Frequencies are currently based on [VarDict](https://github.com/AstraZeneca-NGS/VarDict) calls only and limited to 'high confidence' regions as determined by the [Genome in a Botte consortium].

 ```{r af}
 # Global AF
 af <- read.table(file.path(patientpath, 'af', 'af_vardict_tumor.txt'),
                header=FALSE, colClasses='numeric') %>%
  mutate(set='wgs') %>%
  select(set, af=V1) 

 # AF for AstraZeneca's AZ300 gene set
 af_az300 <- read.table(file.path(patientpath, 'af', 'af_vardict_az300.txt'),
                header=FALSE) %>%
  mutate(set='cancer_genes') %>%
  select(set, af=V6) 

 all_af <- rbind(af, af_az300)
 ggplot(all_af, aes(af)) + 
  geom_histogram(stat='bin', binwidth = 0.01) +
  facet_wrap(~set, scales='free_y')
 ```

 ## Mutational signature

 [PCGR](https://github.com/sigven/pcgr) uses [deconstructSigs](https://cran.r-project.org/web/packages/deconstructSigs/index.html) to generate somatic signatures. From our experience it can miss signatures quite frequently; using [MutationalPatterns](http://bioconductor.org/packages/release/bioc/html/MutationalPatterns.html) to doublecheck.

 ```{r importVCF}
 ref_genome <- "BSgenome.Hsapiens.UCSC.hg19"
 library(ref_genome, character.only = TRUE)

 # Grab ensemble calls from the post-processing stage. These were converted with
 #
 # awk '{ if($0 !~ /^#/) print "chr"$0; else if(match($0,/(##contig=<ID=)(.*)/,m)) 
 # print m[1]"chr"m[2]; else print $0 }' | grep -v chrG
 #
 # Also needs removal of GL.* chromosomes. Sigh..
 #
 vcf <- read_vcfs_as_granges(file.path(patientpath,
                                      'somatic',
                                      'rstudio',
                                      paste(BATCH, 'ensemble-annotated-subset-ucsf.vcf', sep='-')),
                            TUMOR, 
                            ref_genome)
 ```

 Look at somatic change distribution:

 ```{r somProfile}
 muts <- mutations_from_vcf(vcf)
 type_occurrences <- mut_type_occurrences(vcf, ref_genome)
 plot_spectrum(type_occurrences, CT=TRUE) 

 # Could do this relative to a few reference samples
 mut_mat <- mut_matrix(vcf_list=as.list(vcf), ref_genome=ref_genome)
 plot_96_profile(mut_mat)
 ```

 Extract somatic signatures and compare to the reference set:

 ```{r somSig}
 # Get Sanger sigs
 sp_url <- paste("http://cancer.sanger.ac.uk/cancergenome/assets/",
                "signatures_probabilities.txt", sep = "")
 cancer_signatures <- read.table(sp_url, sep = "\t", header = TRUE)

 # Reorder (to make the order of the trinucleotide changes the same)
 cancer_signatures <- cancer_signatures[order(cancer_signatures[, 1]), ]

 # Only signatures in matrix
 cancer_signatures <- as.matrix(cancer_signatures[, 4:33])

 # Fit mutation matrix to cancer signatures. This function finds the optimal linear 
 # combination of mutation signatures that most closely reconstructs the
 # mutation matrix by solving a non-negative least-squares constraints problem.
 fit_res <- fit_to_signatures(mut_mat, cancer_signatures)

 # Select signatures with some contribution
 select <- which(rowSums(fit_res$contribution) > 0)

 # plot contribution
 result <- as.data.frame(fit_res$contribution[select, ]) %>%
  mutate(Contribution=fit_res$contribution[select, ]) %>%
  select(Contribution) %>%
  arrange(-Contribution)

 # Quick summary of the results; no plotting yet
 datatable(result) %>%
  formatRound('Contribution', 1)
 ```

 Check for positional enrichment of somatic signatures (limited to autosomes):


 ```{r rainfall}
 chromosomes <- seqnames(get(ref_genome))[1:22]
 plot_rainfall(vcf[[1]], chromosomes=chromosomes, cex = 1.5, ylim = 1e+09 )
 ```

 ## Addendum

 ```{r sessioninfo}
 devtools::session_info()
 ```
	---
	title: "UMCCR Patient Summary"
	author: "Oliver Hofmann"
	date: "`r Sys.Date()`"
	output:
	html_document:
	theme: readable
	toc: false
	toc_float: false
	code_folding: hide
	rmdformats::material:
	highlight: kate
	---

	```{r custom, echo=FALSE, message=FALSE, warning=FALSE}
	library(ggplot2)
	library(knitr)
	library(rmarkdown)
	library(dplyr)
	library(DT)
	library(MutationalPatterns)
	library(BSgenome)
	library(here)
	```

	```{r setup}
	# Set the patient identifier / name of the folder and other variables. These are equivalent
	# to the variables used in the postprocessing script. Could be passed as arguments eventually
	PATIENT <- 'WPT-013'
	TUMOR <- 'WPT-013-organoid'
	BATCH <- 'batch1'

	# Set basepath depending on patient data we want to process. Assumes standard
	# `analysis` folder structure
	patientpath <- here(PATIENT)
	```

	## Allelic frequencies

	Comparing the allelic frequencies of all somatic mutations vs those present in a subset of ~300 known cancer genes. Frequencies are currently based on [VarDict](https://github.com/AstraZeneca-NGS/VarDict) calls only and limited to 'high confidence' regions as determined by the [Genome in a Botte consortium].

	```{r af}
	# Global AF
	af <- read.table(file.path(patientpath, 'af', 'af_vardict_tumor.txt'),
	header=FALSE, colClasses='numeric') %>%
	mutate(set='wgs') %>%
	select(set, af=V1)

	# AF for AstraZeneca's AZ300 gene set
	af_az300 <- read.table(file.path(patientpath, 'af', 'af_vardict_az300.txt'),
	header=FALSE) %>%
	mutate(set='cancer_genes') %>%
	select(set, af=V6)

	all_af <- rbind(af, af_az300)
	ggplot(all_af, aes(af)) +
	geom_histogram(stat='bin', binwidth = 0.01) +
	facet_wrap(~set, scales='free_y')
	```

	## Mutational signature

	[PCGR](https://github.com/sigven/pcgr) uses [deconstructSigs](https://cran.r-project.org/web/packages/deconstructSigs/index.html) to generate somatic signatures. From our experience it can miss signatures quite frequently; using [MutationalPatterns](http://bioconductor.org/packages/release/bioc/html/MutationalPatterns.html) to doublecheck.

	```{r importVCF}
	ref_genome <- "BSgenome.Hsapiens.UCSC.hg19"
	library(ref_genome, character.only = TRUE)

	# Grab ensemble calls from the post-processing stage. These were converted with
	#
	# awk '{ if($0 !~ /^#/) print "chr"$0; else if(match($0,/(##contig=<ID=)(.*)/,m))
	# print m[1]"chr"m[2]; else print $0 }' \| grep -v chrG
	#
	# Also needs removal of GL.* chromosomes. Sigh..
	#
	vcf <- read_vcfs_as_granges(file.path(patientpath,
	'somatic',
	'rstudio',
	paste(BATCH, 'ensemble-annotated-subset-ucsf.vcf', sep='-')),
	TUMOR,
	ref_genome)
	```

	Look at somatic change distribution:

	```{r somProfile}
	muts <- mutations_from_vcf(vcf)
	type_occurrences <- mut_type_occurrences(vcf, ref_genome)
	plot_spectrum(type_occurrences, CT=TRUE)

	# Could do this relative to a few reference samples
	mut_mat <- mut_matrix(vcf_list=as.list(vcf), ref_genome=ref_genome)
	plot_96_profile(mut_mat)
	```

	Extract somatic signatures and compare to the reference set:

	```{r somSig}
	# Get Sanger sigs
	sp_url <- paste("http://cancer.sanger.ac.uk/cancergenome/assets/",
	"signatures_probabilities.txt", sep = "")
	cancer_signatures <- read.table(sp_url, sep = "\t", header = TRUE)

	# Reorder (to make the order of the trinucleotide changes the same)
	cancer_signatures <- cancer_signatures[order(cancer_signatures[, 1]), ]

	# Only signatures in matrix
	cancer_signatures <- as.matrix(cancer_signatures[, 4:33])

	# Fit mutation matrix to cancer signatures. This function finds the optimal linear
	# combination of mutation signatures that most closely reconstructs the
	# mutation matrix by solving a non-negative least-squares constraints problem.
	fit_res <- fit_to_signatures(mut_mat, cancer_signatures)

	# Select signatures with some contribution
	select <- which(rowSums(fit_res$contribution) > 0)

	# plot contribution
	result <- as.data.frame(fit_res$contribution[select, ]) %>%
	mutate(Contribution=fit_res$contribution[select, ]) %>%
	select(Contribution) %>%
	arrange(-Contribution)

	# Quick summary of the results; no plotting yet
	datatable(result) %>%
	formatRound('Contribution', 1)
	```

	Check for positional enrichment of somatic signatures (limited to autosomes):


	```{r rainfall}
	chromosomes <- seqnames(get(ref_genome))[1:22]
	plot_rainfall(vcf[[1]], chromosomes=chromosomes, cex = 1.5, ylim = 1e+09 )
	```

	## Addendum

	```{r sessioninfo}
	devtools::session_info()
	```