Created
April 17, 2019 09:09
-
-
Save SaskiaFreytag/f0f6904ec1f6e754a655d9fb1d17cab2 to your computer and use it in GitHub Desktop.
Code Review
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# outline the packages needed for this script | |
# pak::pkg_install("hadley/requirements") | |
# requirements::req_file("SNP_saturation.R") | |
# Input 1-2 sentences that describe what this does, when it was done, and why | |
if (!requireNamespace("parallel", quietly = TRUE)) { | |
install.packages("parallel") | |
} | |
if (!requireNamespace("vcfR", quietly = TRUE)) { | |
install.packages("vcfR") | |
} | |
if (!requireNamespace("dplyr", quietly = TRUE)) { | |
install.packages("dplyr") | |
} | |
# State the libraries up front | |
library(parallel) | |
library(dplyr) | |
library(cvfR) | |
# Do not output scientific notation | |
options(scipen = 999) | |
#' Why are you running gc at the start? | |
# Initial garbage collection | |
gc() | |
# Set command line arguments | |
# (why? The output is character(0)? or is this not supposed to be run interactively?) | |
args <- commandArgs(TRUE) | |
bam <- args[1] | |
#' tools::file_ext("wat.bam") I think does the same thing? | |
sample_bam <- gsub(".bam", "", bam) | |
extr_seq <- as.numeric(args[2]) | |
outFile <- args[3] | |
#' Jenny Bryan will set your computer on fire! | |
genome <- | |
"/dd_userdata/usrdat02/userdata/sfreytag/indexes/hg19.premrna/fasta/genome.fa" | |
# Prepare files | |
# First subset to only chromosome X | |
#system(paste0("samtools view -h ", bam, " X > ", sample_bam, "_chrX.bam")) | |
#' why are you using `system` on paste? | |
# Remove additional tags on barcodes | |
barcodes <- | |
read.table(paste0(sample_bam , "_barcodes.tsv"), | |
header = F, | |
sep = "\t") | |
# find and replace -1 with 0? | |
barcodes$V1 <- gsub("-1", "", barcodes$V1) | |
# is "aa" a good variable name? | |
aa <- length(barcodes$V1) | |
write.table( | |
barcodes, | |
file = paste0(sample_bam, "_barcodes_corr.tsv"), | |
sep = "\t", | |
quote = F, | |
col.names = F, | |
row.names = F | |
) | |
# Make vcf of SNPs | |
#system(paste0("source activate samtools | |
# samtools mpileup -f ", genome , " -g ", sample_bam, | |
# "_chrX.bam | bcftools call -m > ", sample_bam, "_chrX.vcf | |
# source deactivate")) | |
# Subset to heterozygous SNPs | |
#system(paste0('grep "0/1" ', sample_bam, '_chrX.vcf > ', sample_bam, '_chrX.het.vcf')) | |
# Reattach header | |
#system(paste0('grep "#" ', sample_bam, '_chrX.vcf | cat - ', sample_bam, '_chrX.het.vcf > ', sample_bam, '_chrX.het.corr.vcf')) | |
## Function for one run | |
one_run <- function(i, per_i) { | |
library(vcfR) # put library calls outside of the function | |
library(dplyr) # ^^ | |
message(paste0("Iteration: ", i)) | |
# Prevent all parallel runs from having the same starting seed | |
Sys.sleep(i) | |
# Subsample the bam file at percentage and then index | |
system( | |
paste0( | |
"source activate samtools | |
samtools view -s ", | |
per_i + i , | |
" -b ", | |
sample_bam, | |
"_chrX.bam > sub_sampled_tmp_", | |
i, | |
".bam | |
source deactivate" | |
) | |
) | |
system(paste0("samtools index sub_sampled_tmp_", i, ".bam")) | |
# Apply CellSNP to subsampled file | |
system( | |
paste0( | |
"source activate CellSNP | |
cellSNP -s sub_sampled_tmp_", | |
i, | |
".bam -b ", | |
sample_bam, | |
"_barcodes_corr.tsv", | |
" -o sub_sampled_tmp_", | |
i, | |
".snp -R ", | |
sample_bam, | |
"_chrX.het.corr.vcf -p 20 | |
source deactivate" | |
) | |
) | |
# Count the number of SNPs and cells with SNPs | |
vcf <- read.vcfR(paste0("sub_sampled_tmp_", i, ".snp.gz"), | |
verbose = FALSE) | |
number_snps <- dim(vcf@fix)[1] | |
vcf_tibble <- vcfR2tidy(vcf, | |
format_fields = c("DP", "AD", "OTH", "ALL")) | |
vcf_tibble_narm <- vcf_tibble$gt %>% | |
mutate(present = gt_DP >= 0) %>% | |
group_by(Indiv) %>% | |
summarize(n_alleles = sum(present, na.rm = TRUE)) | |
number_cells_1_snps <- dim(vcf_tibble_narm %>% | |
ungroup() %>% | |
filter(n_alleles >= 1))[1] | |
number_cells_2_snps <- dim(vcf_tibble_narm %>% | |
ungroup() %>% | |
filter(n_alleles >= 2))[1] | |
# Remove temporary files | |
system(paste0("rm *_tmp_", i, "*")) | |
return( | |
list( | |
number_snps = number_snps, | |
number_cells_1_snps = number_cells_1_snps, | |
number_cells_2_snps = number_cells_2_snps | |
) | |
) | |
} # end of function | |
#' This function should be in a separate script that can be "source"'d. | |
## All percentages to be investigated | |
per <- seq(0.05, 0.95, 0.05) | |
## Initialize objects to save | |
#' this is a dangerous pattern - should try and specify the size of the list | |
res <- list() | |
for (i in 1:length(per)) { | |
per_tmp <- per[i] | |
cl <- makeCluster(5) | |
clusterExport(cl, c("per_tmp", "sample_bam", "one_run")) | |
res[[i]] <- parLapply(cl, 1:5, | |
function(x){ | |
one_run(x, per_tmp) | |
} | |
) # make your function clearer | |
stopCluster(cl) | |
} | |
saveRDS(res, file = paste0("Results_", outFile, ".rds")) | |
# Make a dataframe from results averaged over 5 runs | |
res1 <- | |
data.frame( | |
number_snps = sapply(res, | |
function(x) mean(sapply(x, | |
function(y) y[[1]]))), | |
number_cells_1_snp = sapply(res, | |
function(x) mean(sapply(x, | |
function(y) y[[2]]))), | |
number_cells_2_snps = sapply(res, | |
function(x) mean(sapply(x, | |
function(y) y[[3]]))), | |
per = per | |
) | |
# Make a dataframe from results | |
res2 <- | |
data.frame( | |
number_snps = c(sapply(res, function(x) sapply(x, | |
function(y) y[[1]]))), | |
number_cells_1_snp = c(sapply(res, | |
function(x) sapply(x, | |
function(y) y[[2]]))), | |
number_cells_2_snps = c(sapply(res, | |
function(x) sapply(x, | |
function(y) y[[3]]))), | |
per = rep(per, each = 5) | |
) | |
# Polynomial function | |
f <- function(x, a, b, n) { | |
(a * x ^ n) / (x ^ n + b) | |
} | |
# Fit polynomial function for number of cells with at least one SNP | |
inital_value_n <- | |
lm(log(number_cells_1_snp + 1) ~ log(per), | |
res2)$coefficients[2] | |
fit <- | |
nls( | |
number_cells_1_snp ~ f(per, a, b, n), | |
data = res2, | |
start = list(a = 4634, | |
b = 1, | |
n = inital_value_n) | |
) | |
# Fit polynomial function for number of cells with at least two SNPs | |
inital_value_n_1 <- | |
lm(log(number_cells_1_snp + 1) ~ log(per), | |
res2)$coefficients[2] | |
fit1 <- | |
nls( | |
number_cells_2_snps ~ f(per, a, b, n), | |
data = res2, | |
start = list(a = 4634, | |
b = 1, | |
n = inital_value_n_1) | |
) | |
# Fit polynomial function for number of SNPs | |
inital_value_n_0 <- | |
lm(log(number_cells_1_snp + 1) ~ log(per), res2)$coefficients[2] | |
#' woh - why are are you using try here? | |
try(fit0 <- | |
nls( | |
number_snps ~ f(per, a, b, n), | |
data = res2, | |
start = list(a = res2[dim(res2)[1], 2], b = 1, n = inital_value_n_0) | |
)) | |
# Find range for extrapolation | |
extr_end <- 1 + extr_seq + 0.5 | |
per_extr <- seq(0.05, extr_end, 0.01) | |
# Make dataset for extrapolation | |
res_extr <- data.frame( | |
per = per_extr, | |
number_cells_1_snp = sapply(per_extr, | |
function(x){ f(x, | |
coef(fit)[1], | |
coef(fit)[2], | |
coef(fit)[3]) | |
} | |
), | |
number_cells_2_snps = sapply(per_extr, | |
function(x){ | |
f(x, | |
coef(fit1)[1], | |
coef(fit1)[2], | |
coef(fit1)[3]) | |
} | |
) | |
) | |
try(res_extr$number_snps <- | |
sapply(per_extr, | |
function(x){ | |
f(x, | |
coef(fit0)[1], | |
coef(fit0)[2], | |
coef(fit0)[3]) | |
} | |
) | |
) | |
# Plot results and output them | |
png(paste0(outFile, "_cells_1_snp.png")) | |
plot(res_extr$per, | |
res_extr$number_cells_1_snp, | |
xlab = "Percentage*100", | |
ylab = "Number of cells with 1 SNP") | |
points(res1$per, | |
res1$number_cells_1_snp, | |
col = "red", | |
pch = 19) | |
abline(v = extr_seq + 1) | |
txt.x <- extr_end / 2 | |
txt.y <- max(res_extr$number_cells_1_snp) / 2 | |
text(txt.x, | |
txt.y, | |
labels = paste( | |
"With", | |
round(extr_seq, 2), | |
" more reads: \n", | |
round(f( | |
1 + extr_seq, coef(fit)[1], coef(fit)[2], coef(fit)[3] | |
), 0) , | |
"cells detected" | |
)) | |
dev.off() | |
png(paste0(outFile, "_cells_2_snps.png")) | |
plot(res_extr$per, | |
res_extr$number_cells_2_snps, | |
xlab = "Percentage*100", | |
ylab = "Number of cells with 2 SNPs") | |
points(res1$per, | |
res1$number_cells_2_snps, | |
col = "red", | |
pch = 19) | |
abline(v = extr_seq + 1) | |
txt.x <- extr_end / 2 | |
txt.y <- max(res_extr$number_cells_2_snps) / 2 | |
text(txt.x, | |
txt.y, | |
labels = paste( | |
"With", | |
round(extr_seq, 2), | |
" more reads: \n", | |
round(f( | |
1 + extr_seq, coef(fit1)[1], coef(fit1)[2], coef(fit1)[3] | |
), 0) , | |
"cells detected" | |
)) | |
dev.off() | |
try({ | |
png(paste0(outFile, "_snps.png")) | |
plot(res_extr$per, | |
res_extr$number_snps, | |
xlab = "Percentage*100", | |
ylab = "Number ofSNPs") | |
points(res1$per, res1$number_snps, col = "red", pch = 19) | |
abline(v = extr_seq + 1) | |
txt.x <- extr_end / 2 | |
txt.y <- 100 | |
text(txt.x, | |
txt.y, | |
labels = paste( | |
"With", | |
round(extr_seq, 2), | |
" more reads: \n", | |
round(f( | |
1 + extr_seq, coef(fit0)[1], coef(fit0)[2], coef(fit0)[3] | |
), 0) , | |
"SNPs detected" | |
)) | |
dev.off() | |
} | |
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(parallel) | |
# Do not output scientific notation | |
options(scipen=999) | |
# Initial garbage collection | |
gc() | |
# Set command line arguments | |
args <- commandArgs(TRUE) | |
bam <- args[1] | |
sample_bam <- gsub(".bam", "", bam) | |
extr_seq <- as.numeric(args[2]) | |
outFile <- args[3] | |
genome = "/dd_userdata/usrdat02/userdata/sfreytag/indexes/hg19.premrna/fasta/genome.fa" | |
# Prepare files | |
# First subset to only chromosome X | |
#system(paste0("samtools view -h ", bam, " X > ", sample_bam, "_chrX.bam")) | |
# Remove additional tags on barcodes | |
barcodes <- read.table(paste0(sample_bam ,"_barcodes.tsv"), header=F, sep="\t") | |
barcodes$V1 <- gsub("-1", "", barcodes$V1) | |
aa <- length(barcodes$V1) | |
write.table(barcodes, file=paste0(sample_bam, "_barcodes_corr.tsv"), sep="\t", | |
quote=F, col.names=F, row.names=F) | |
# Make vcf of SNPs | |
#system(paste0("source activate samtools | |
# samtools mpileup -f ", genome , " -g ", sample_bam, | |
# "_chrX.bam | bcftools call -m > ", sample_bam, "_chrX.vcf | |
# source deactivate")) | |
# Subset to heterozygous SNPs | |
#system(paste0('grep "0/1" ', sample_bam, '_chrX.vcf > ', sample_bam, '_chrX.het.vcf')) | |
# Reattach header | |
#system(paste0('grep "#" ', sample_bam, '_chrX.vcf | cat - ', sample_bam, '_chrX.het.vcf > ', sample_bam, '_chrX.het.corr.vcf')) | |
## Function for one run | |
one_run <- function(i, per_i) { | |
library(vcfR) | |
library(dplyr) | |
print(paste0("Iteration: ", i)) | |
# Prevent all parallel runs from having the same starting seed | |
Sys.sleep(i) | |
# Subsample the bam file at percentage and then index | |
system(paste0("source activate samtools | |
samtools view -s ", per_i+i , " -b ", sample_bam, "_chrX.bam > sub_sampled_tmp_", i, ".bam | |
source deactivate")) | |
system(paste0("samtools index sub_sampled_tmp_", i, ".bam")) | |
# Apply CellSNP to subsampled file | |
system(paste0("source activate CellSNP | |
cellSNP -s sub_sampled_tmp_", i, ".bam -b ", | |
sample_bam, "_barcodes_corr.tsv", | |
" -o sub_sampled_tmp_", i, ".snp -R ", sample_bam, "_chrX.het.corr.vcf -p 20 | |
source deactivate")) | |
# Count the number of SNPs and cells with SNPs | |
vcf <- read.vcfR(paste0("sub_sampled_tmp_", i, ".snp.gz"), verbose = FALSE) | |
number_snps <- dim(vcf@fix) [1] | |
vcf_tibble <- vcfR2tidy(vcf, format_fields = c("DP", "AD", "OTH", "ALL")) | |
vcf_tibble_narm <- vcf_tibble$gt %>% | |
mutate(present=gt_DP>=0) %>% group_by(Indiv) %>% | |
summarize(n_alleles=sum(present, na.rm=TRUE)) | |
number_cells_1_snps <- dim(vcf_tibble_narm %>% | |
ungroup() %>% | |
filter(n_alleles>=1))[1] | |
number_cells_2_snps <- dim(vcf_tibble_narm %>% | |
ungroup() %>% | |
filter(n_alleles>=2))[1] | |
# Remove temporary files | |
system(paste0("rm *_tmp_", i, "*")) | |
return(list(number_snps=number_snps, number_cells_1_snps=number_cells_1_snps, | |
number_cells_2_snps=number_cells_2_snps)) | |
} | |
## All percentages to be investigated | |
per <- seq(0.05,0.95,0.05) | |
## Initialize objects to save | |
res <- list() | |
for (i in 1:length(per)){ | |
per_tmp <- per[i] | |
cl <- makeCluster(5) | |
clusterExport(cl, c("per_tmp", "sample_bam", "one_run")) | |
res[[i]] <- parLapply(cl, 1:5, function(x) one_run(x, per_tmp)) | |
stopCluster(cl) | |
} | |
saveRDS(res, file=paste0("Results_", outFile, ".rds")) | |
# Make a dataframe from results averaged over 5 runs | |
res1 <- data.frame(number_snps = sapply(res, function(x) mean(sapply(x, function(y) y[[1]]))), | |
number_cells_1_snp = sapply(res, function(x) mean(sapply(x, function(y) y[[2]]))), | |
number_cells_2_snps = sapply(res, function(x) mean(sapply(x, function(y) y[[3]]))), | |
per = per) | |
# Make a dataframe from results | |
res2 <- data.frame(number_snps = c(sapply(res, function(x) sapply(x, function(y) y[[1]]))), | |
number_cells_1_snp = c(sapply(res, function(x) sapply(x, function(y) y[[2]]))), | |
number_cells_2_snps = c(sapply(res, function(x) sapply(x, function(y) y[[3]]))), | |
per = rep(per, each=5)) | |
# Polynomial function | |
f <- function(x, a, b, n) { | |
(a*x^n)/ (x^n+b) | |
} | |
# Fit polynomial function for number of cells with at least one SNP | |
inital_value_n <- lm(log(number_cells_1_snp+1)~log(per), res2)$coefficients[2] | |
fit <- nls(number_cells_1_snp~f(per,a,b,n), data=res2, start=list(a=4634, b=1, n=inital_value_n)) | |
# Fit polynomial function for number of cells with at least two SNPs | |
inital_value_n_1 <- lm(log(number_cells_1_snp+1)~log(per), res2)$coefficients[2] | |
fit1 <- nls(number_cells_2_snps~f(per,a,b,n), data=res2, start=list(a=4634, b=1, n=inital_value_n_1)) | |
# Fit polynomial function for number of SNPs | |
inital_value_n_0 <- lm(log(number_cells_1_snp+1)~log(per), res2)$coefficients[2] | |
try(fit0 <- nls(number_snps~f(per,a,b,n), data=res2, start=list(a=res2[dim(res2)[1],2], b=1, n=inital_value_n_0))) | |
# Find range for extrapolation | |
extr_end <- 1 + extr_seq + 0.5 | |
per_extr <- seq(0.05, extr_end, 0.01) | |
# Make dataset for extrapolation | |
res_extr <- data.frame(per=per_extr, | |
number_cells_1_snp=sapply(per_extr, function(x) f(x, coef(fit)[1], coef(fit)[2], coef(fit)[3])), | |
number_cells_2_snps=sapply(per_extr, function(x) f(x, coef(fit1)[1], coef(fit1)[2], coef(fit1)[3]))) | |
try(res_extr$number_snps <- sapply(per_extr, function(x) f(x, coef(fit0)[1], coef(fit0)[2], coef(fit0)[3]))) | |
# Plot results and output them | |
png(paste0(outFile, "_cells_1_snp.png")) | |
plot(res_extr$per, res_extr$number_cells_1_snp, xlab="Percentage*100", ylab="Number of cells with 1 SNP") | |
points(res1$per, res1$number_cells_1_snp, col="red", pch=19) | |
abline(v=extr_seq+1) | |
txt.x <- extr_end/2 | |
txt.y <- max(res_extr$number_cells_1_snp)/2 | |
text(txt.x, txt.y, | |
labels=paste("With", round(extr_seq,2), " more reads: \n", | |
round(f(1+extr_seq, coef(fit)[1], coef(fit)[2], coef(fit)[3]),0) ,"cells detected")) | |
dev.off() | |
png(paste0(outFile, "_cells_2_snps.png")) | |
plot(res_extr$per, res_extr$number_cells_2_snps, xlab="Percentage*100", ylab="Number of cells with 2 SNPs") | |
points(res1$per, res1$number_cells_2_snps, col="red", pch=19) | |
abline(v=extr_seq+1) | |
txt.x <- extr_end/2 | |
txt.y <- max(res_extr$number_cells_2_snps)/2 | |
text(txt.x, txt.y, | |
labels=paste("With", round(extr_seq,2), " more reads: \n", | |
round(f(1+extr_seq, coef(fit1)[1], coef(fit1)[2], coef(fit1)[3]),0) ,"cells detected")) | |
dev.off() | |
try({png(paste0(outFile, "_snps.png")) | |
plot(res_extr$per, res_extr$number_snps, xlab="Percentage*100", ylab="Number ofSNPs") | |
points(res1$per, res1$number_snps, col="red", pch=19) | |
abline(v=extr_seq+1) | |
txt.x <- extr_end/2 | |
txt.y <- 100 | |
text(txt.x, txt.y, | |
labels=paste("With", round(extr_seq,2), " more reads: \n", | |
round(f(1+extr_seq, coef(fit0)[1], coef(fit0)[2], coef(fit0)[3]),0) ,"SNPs detected")) | |
dev.off()}) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment