SaskiaFreytag · April 17, 2019 09:09
diff --git a/after_code_review b/after_code_review
 # outline the packages needed for this script
 # pak::pkg_install("hadley/requirements")
 # requirements::req_file("SNP_saturation.R")

 # Input 1-2 sentences that describe what this does, when it was done, and why

 if (!requireNamespace("parallel", quietly = TRUE)) {
  install.packages("parallel")
 }

 if (!requireNamespace("vcfR", quietly = TRUE)) {
  install.packages("vcfR")
 }

 if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
 }

 # State the libraries up front
 library(parallel)
 library(dplyr)
 library(cvfR)

 # Do not output scientific notation
 options(scipen = 999)

 #' Why are you running gc at the start?
 # Initial garbage collection
 gc()

 # Set command line arguments
 # (why? The output is character(0)? or is this not supposed to be run interactively?)
 args <- commandArgs(TRUE)

 bam <- args[1]
 #' tools::file_ext("wat.bam") I think does the same thing?
 sample_bam <- gsub(".bam", "", bam)
 extr_seq <- as.numeric(args[2])
 outFile <- args[3]

 #' Jenny Bryan will set your computer on fire!
 genome  <-
  "/dd_userdata/usrdat02/userdata/sfreytag/indexes/hg19.premrna/fasta/genome.fa"

 # Prepare files
 # First subset to only chromosome X
 #system(paste0("samtools view -h ", bam, " X > ", sample_bam, "_chrX.bam"))
 #' why are you using `system` on paste?

 # Remove additional tags on barcodes
 barcodes <-
  read.table(paste0(sample_bam , "_barcodes.tsv"),
             header = F,
             sep = "\t")

 # find and replace -1 with 0?
 barcodes$V1 <- gsub("-1", "", barcodes$V1)

 # is "aa" a good variable name?
 aa <- length(barcodes$V1)
 write.table(
  barcodes,
  file = paste0(sample_bam, "_barcodes_corr.tsv"),
  sep = "\t",
  quote = F,
  col.names = F,
  row.names = F
 )

 # Make vcf of SNPs
 #system(paste0("source activate samtools
 #              samtools mpileup -f ", genome , " -g ", sample_bam,
 #              "_chrX.bam | bcftools call -m > ", sample_bam, "_chrX.vcf
 #              source deactivate"))

 # Subset to heterozygous SNPs
 #system(paste0('grep "0/1" ', sample_bam, '_chrX.vcf > ', sample_bam, '_chrX.het.vcf'))
 # Reattach header
 #system(paste0('grep "#" ', sample_bam, '_chrX.vcf | cat - ', sample_bam, '_chrX.het.vcf > ', sample_bam, '_chrX.het.corr.vcf'))

 ## Function for one run

 one_run <- function(i, per_i) {
  library(vcfR) # put library calls outside of the function
  library(dplyr) # ^^
  
  message(paste0("Iteration: ", i))
  
  # Prevent all parallel runs from having the same starting seed
  Sys.sleep(i)
  
  # Subsample the bam file at percentage and then index
  system(
    paste0(
      "source activate samtools
                samtools view -s ",
      per_i + i ,
      " -b ",
      sample_bam,
      "_chrX.bam > sub_sampled_tmp_",
      i,
      ".bam
                source deactivate"
    )
  )
  system(paste0("samtools index sub_sampled_tmp_", i, ".bam"))
  
  # Apply CellSNP to subsampled file
  system(
    paste0(
      "source activate CellSNP
              cellSNP -s sub_sampled_tmp_",
      i,
      ".bam -b ",
      sample_bam,
      "_barcodes_corr.tsv",
      " -o sub_sampled_tmp_",
      i,
      ".snp -R ",
      sample_bam,
      "_chrX.het.corr.vcf -p 20
              source deactivate"
    )
  )
  
  # Count the number of SNPs and cells with SNPs
  vcf <- read.vcfR(paste0("sub_sampled_tmp_", i, ".snp.gz"), 
                   verbose = FALSE)
  
  number_snps <- dim(vcf@fix)[1]
  
  vcf_tibble <- vcfR2tidy(vcf, 
                          format_fields = c("DP", "AD", "OTH", "ALL"))
  
  vcf_tibble_narm <- vcf_tibble$gt %>%
    mutate(present = gt_DP >= 0) %>% 
    group_by(Indiv) %>%
    summarize(n_alleles = sum(present, na.rm = TRUE))
  
  number_cells_1_snps <- dim(vcf_tibble_narm %>%
                               ungroup() %>%
                               filter(n_alleles >= 1))[1]
  
  number_cells_2_snps <- dim(vcf_tibble_narm %>%
                               ungroup() %>%
                               filter(n_alleles >= 2))[1]
  
  # Remove temporary files
  system(paste0("rm *_tmp_", i, "*"))
  
  return(
    list(
      number_snps = number_snps,
      number_cells_1_snps = number_cells_1_snps,
      number_cells_2_snps = number_cells_2_snps
    )
  )
  
 } # end of function
 #' This function should be in a separate script that can be "source"'d.

 ## All percentages to be investigated
 per <- seq(0.05, 0.95, 0.05)

 ## Initialize objects to save
 #' this is a dangerous pattern - should try and specify the size of the list
 res <- list()

 for (i in 1:length(per)) {
  per_tmp <- per[i]
  
  cl <- makeCluster(5)
  clusterExport(cl, c("per_tmp", "sample_bam", "one_run"))
  res[[i]] <- parLapply(cl, 1:5, 
                        function(x){
                          one_run(x, per_tmp)
                        }
  ) # make your function clearer
  
  stopCluster(cl)
  
 }

 saveRDS(res, file = paste0("Results_", outFile, ".rds"))

 # Make a dataframe from results averaged over 5 runs
 res1 <-
  data.frame(
    number_snps = sapply(res, 
                         function(x) mean(sapply(x, 
                                                 function(y) y[[1]]))),
    number_cells_1_snp = sapply(res, 
                                function(x) mean(sapply(x, 
                                                        function(y) y[[2]]))),
    number_cells_2_snps = sapply(res, 
                                 function(x) mean(sapply(x, 
                                                         function(y) y[[3]]))),
    per = per
  )

 # Make a dataframe from results
 res2 <-
  data.frame(
    number_snps = c(sapply(res, function(x) sapply(x, 
                                                   function(y) y[[1]]))),
    number_cells_1_snp = c(sapply(res, 
                                  function(x) sapply(x, 
                                                     function(y) y[[2]]))),
    number_cells_2_snps = c(sapply(res, 
                                   function(x) sapply(x, 
                                                      function(y) y[[3]]))),
    per = rep(per, each = 5)
  )

 # Polynomial function
 f <- function(x, a, b, n) {
  (a * x ^ n) / (x ^ n + b)
 }

 # Fit polynomial function for number of cells with at least one SNP
 inital_value_n <-
  lm(log(number_cells_1_snp + 1) ~ log(per), 
     res2)$coefficients[2]

 fit <-
  nls(
    number_cells_1_snp ~ f(per, a, b, n),
    data = res2,
    start = list(a = 4634, 
                 b = 1, 
                 n = inital_value_n)
  )

 # Fit polynomial function for number of cells with at least two SNPs
 inital_value_n_1 <-
  lm(log(number_cells_1_snp + 1) ~ log(per), 
     res2)$coefficients[2]

 fit1 <-
  nls(
    number_cells_2_snps ~ f(per, a, b, n),
    data = res2,
    start = list(a = 4634, 
                 b = 1, 
                 n = inital_value_n_1)
  )

 # Fit polynomial function for number of SNPs
 inital_value_n_0 <-
  lm(log(number_cells_1_snp + 1) ~ log(per), res2)$coefficients[2]

 #' woh - why are are you using try here?
 try(fit0 <-
      nls(
        number_snps ~ f(per, a, b, n),
        data = res2,
        start = list(a = res2[dim(res2)[1], 2], b = 1, n = inital_value_n_0)
      ))

 # Find range for extrapolation
 extr_end <- 1 + extr_seq + 0.5
 per_extr <- seq(0.05, extr_end, 0.01)

 # Make dataset for extrapolation
 res_extr <- data.frame(
  per = per_extr,
  number_cells_1_snp = sapply(per_extr, 
                              function(x){ f(x, 
                                            coef(fit)[1], 
                                            coef(fit)[2], 
                                            coef(fit)[3])
                                }
                              ),
  number_cells_2_snps = sapply(per_extr, 
                               function(x){
                                 f(x, 
                                   coef(fit1)[1], 
                                   coef(fit1)[2], 
                                   coef(fit1)[3])
                                 }
                               )
 )

 try(res_extr$number_snps <-
      sapply(per_extr, 
             function(x){
               f(x, 
                 coef(fit0)[1], 
                 coef(fit0)[2], 
                 coef(fit0)[3])
               }
             )
    )

 # Plot results and output them
 png(paste0(outFile, "_cells_1_snp.png"))

 plot(res_extr$per,
     res_extr$number_cells_1_snp,
     xlab = "Percentage*100",
     ylab = "Number of cells with 1 SNP")
 points(res1$per,
       res1$number_cells_1_snp,
       col = "red",
       pch = 19)
 abline(v = extr_seq + 1)
 txt.x <- extr_end / 2
 txt.y <- max(res_extr$number_cells_1_snp) / 2
 text(txt.x,
     txt.y,
     labels = paste(
       "With",
       round(extr_seq, 2),
       " more reads: \n",
       round(f(
         1 + extr_seq, coef(fit)[1], coef(fit)[2], coef(fit)[3]
       ), 0) ,
       "cells detected"
     ))
 dev.off()

 png(paste0(outFile, "_cells_2_snps.png"))

 plot(res_extr$per,
     res_extr$number_cells_2_snps,
     xlab = "Percentage*100",
     ylab = "Number of cells with 2 SNPs")
 points(res1$per,
       res1$number_cells_2_snps,
       col = "red",
       pch = 19)
 abline(v = extr_seq + 1)
 txt.x <- extr_end / 2
 txt.y <- max(res_extr$number_cells_2_snps) / 2
 text(txt.x,
     txt.y,
     labels = paste(
       "With",
       round(extr_seq, 2),
       " more reads: \n",
       round(f(
         1 + extr_seq, coef(fit1)[1], coef(fit1)[2], coef(fit1)[3]
       ), 0) ,
       "cells detected"
     ))
 dev.off()

 try({
  png(paste0(outFile, "_snps.png"))
  plot(res_extr$per,
       res_extr$number_snps,
       xlab = "Percentage*100",
       ylab = "Number ofSNPs")
  points(res1$per, res1$number_snps, col = "red", pch = 19)
  abline(v = extr_seq + 1)
  txt.x <- extr_end / 2
  txt.y <- 100
  text(txt.x,
       txt.y,
       labels = paste(
         "With",
         round(extr_seq, 2),
         " more reads: \n",
         round(f(
           1 + extr_seq, coef(fit0)[1], coef(fit0)[2], coef(fit0)[3]
         ), 0) ,
         "SNPs detected"
       ))
  dev.off()
 }
 )
diff --git a/before_code_review b/before_code_review
 library(parallel)

 # Do not output scientific notation
 options(scipen=999)

 # Initial garbage collection
 gc()

 # Set command line arguments
 args <- commandArgs(TRUE)

 bam <- args[1]
 sample_bam <- gsub(".bam", "", bam)
 extr_seq <- as.numeric(args[2])
 outFile <- args[3]
 genome  = "/dd_userdata/usrdat02/userdata/sfreytag/indexes/hg19.premrna/fasta/genome.fa"

 # Prepare files
 # First subset to only chromosome X
 #system(paste0("samtools view -h ", bam, " X > ", sample_bam, "_chrX.bam"))

 # Remove additional tags on barcodes
 barcodes <- read.table(paste0(sample_bam ,"_barcodes.tsv"), header=F, sep="\t")
 barcodes$V1 <- gsub("-1", "", barcodes$V1)
 aa <- length(barcodes$V1)
 write.table(barcodes, file=paste0(sample_bam, "_barcodes_corr.tsv"), sep="\t",
            quote=F, col.names=F, row.names=F)

 # Make vcf of SNPs 
 #system(paste0("source activate samtools 
 #              samtools mpileup -f ", genome , " -g ", sample_bam, 
 #              "_chrX.bam | bcftools call -m > ", sample_bam, "_chrX.vcf
 #              source deactivate"))

 # Subset to heterozygous SNPs
 #system(paste0('grep "0/1" ', sample_bam, '_chrX.vcf > ', sample_bam, '_chrX.het.vcf'))
 # Reattach header
 #system(paste0('grep "#" ', sample_bam, '_chrX.vcf | cat - ', sample_bam, '_chrX.het.vcf > ', sample_bam, '_chrX.het.corr.vcf'))

 ## Function for one run 

 one_run <- function(i, per_i) {
  
  library(vcfR)
  library(dplyr)
  
  print(paste0("Iteration: ", i))
  
  # Prevent all parallel runs from having the same starting seed
  Sys.sleep(i)
  
  # Subsample the bam file at percentage and then index
  system(paste0("source activate samtools
                samtools view -s ",  per_i+i , " -b ", sample_bam, "_chrX.bam > sub_sampled_tmp_", i, ".bam
                source deactivate"))
  system(paste0("samtools index sub_sampled_tmp_", i, ".bam"))

  # Apply CellSNP to subsampled file 
  system(paste0("source activate CellSNP
              cellSNP -s sub_sampled_tmp_", i, ".bam -b ",
              sample_bam, "_barcodes_corr.tsv",
              " -o sub_sampled_tmp_", i, ".snp -R ", sample_bam, "_chrX.het.corr.vcf -p 20 
              source deactivate"))

  # Count the number of SNPs and cells with SNPs
  vcf <- read.vcfR(paste0("sub_sampled_tmp_", i, ".snp.gz"), verbose = FALSE)
  number_snps <- dim(vcf@fix) [1]
  vcf_tibble <- vcfR2tidy(vcf, format_fields = c("DP", "AD", "OTH", "ALL"))
  vcf_tibble_narm <- vcf_tibble$gt %>% 
                     mutate(present=gt_DP>=0) %>% group_by(Indiv) %>% 
                     summarize(n_alleles=sum(present, na.rm=TRUE))
  number_cells_1_snps <- dim(vcf_tibble_narm %>%
                         ungroup() %>%
                         filter(n_alleles>=1))[1]
  number_cells_2_snps <- dim(vcf_tibble_narm %>%
                         ungroup() %>%
                         filter(n_alleles>=2))[1]
  
  # Remove temporary files
  system(paste0("rm *_tmp_", i, "*"))
  
  return(list(number_snps=number_snps, number_cells_1_snps=number_cells_1_snps,
              number_cells_2_snps=number_cells_2_snps))

 }

 ## All percentages to be investigated
 per <- seq(0.05,0.95,0.05)

 ## Initialize objects to save 
 res <- list()

 for (i in 1:length(per)){
  
  per_tmp <- per[i]
  
  cl <- makeCluster(5)
  clusterExport(cl, c("per_tmp", "sample_bam", "one_run"))
  res[[i]] <- parLapply(cl, 1:5, function(x) one_run(x, per_tmp))
  stopCluster(cl)

 }

 saveRDS(res, file=paste0("Results_", outFile, ".rds"))

 # Make a dataframe from results averaged over 5 runs
 res1 <- data.frame(number_snps = sapply(res, function(x) mean(sapply(x, function(y) y[[1]]))),
                   number_cells_1_snp = sapply(res, function(x) mean(sapply(x, function(y) y[[2]]))),
                   number_cells_2_snps = sapply(res, function(x) mean(sapply(x, function(y) y[[3]]))),
                   per = per)

 # Make a dataframe from results 
 res2 <- data.frame(number_snps = c(sapply(res, function(x) sapply(x, function(y) y[[1]]))),
                   number_cells_1_snp = c(sapply(res, function(x) sapply(x, function(y) y[[2]]))),
                   number_cells_2_snps = c(sapply(res, function(x) sapply(x, function(y) y[[3]]))),
                   per = rep(per, each=5))

 # Polynomial function
 f <- function(x, a, b, n) {
  (a*x^n)/ (x^n+b)
 }

 # Fit polynomial function for number of cells with at least one SNP
 inital_value_n <- lm(log(number_cells_1_snp+1)~log(per), res2)$coefficients[2]
 fit <- nls(number_cells_1_snp~f(per,a,b,n), data=res2, start=list(a=4634, b=1, n=inital_value_n))

 # Fit polynomial function for number of cells with at least two SNPs
 inital_value_n_1 <- lm(log(number_cells_1_snp+1)~log(per), res2)$coefficients[2]
 fit1 <- nls(number_cells_2_snps~f(per,a,b,n), data=res2, start=list(a=4634, b=1, n=inital_value_n_1))

 # Fit polynomial function for number of SNPs
 inital_value_n_0 <- lm(log(number_cells_1_snp+1)~log(per), res2)$coefficients[2]
 try(fit0 <- nls(number_snps~f(per,a,b,n), data=res2, start=list(a=res2[dim(res2)[1],2], b=1, n=inital_value_n_0)))

 # Find range for extrapolation
 extr_end <- 1 + extr_seq + 0.5
 per_extr <- seq(0.05, extr_end, 0.01)

 # Make dataset for extrapolation
 res_extr <- data.frame(per=per_extr,
                       number_cells_1_snp=sapply(per_extr, function(x) f(x, coef(fit)[1], coef(fit)[2], coef(fit)[3])),
                       number_cells_2_snps=sapply(per_extr, function(x) f(x, coef(fit1)[1], coef(fit1)[2], coef(fit1)[3])))

 try(res_extr$number_snps <- sapply(per_extr, function(x) f(x, coef(fit0)[1], coef(fit0)[2], coef(fit0)[3])))

 # Plot results and output them
 png(paste0(outFile, "_cells_1_snp.png"))
  plot(res_extr$per, res_extr$number_cells_1_snp, xlab="Percentage*100", ylab="Number of cells with 1 SNP")
  points(res1$per, res1$number_cells_1_snp, col="red", pch=19)
  abline(v=extr_seq+1)
  txt.x <- extr_end/2
  txt.y <- max(res_extr$number_cells_1_snp)/2
  text(txt.x, txt.y,
       labels=paste("With", round(extr_seq,2), " more reads: \n", 
                  round(f(1+extr_seq, coef(fit)[1], coef(fit)[2], coef(fit)[3]),0) ,"cells detected"))
 dev.off()

 png(paste0(outFile, "_cells_2_snps.png"))
  plot(res_extr$per, res_extr$number_cells_2_snps, xlab="Percentage*100", ylab="Number of cells with 2 SNPs")
  points(res1$per, res1$number_cells_2_snps, col="red", pch=19)
  abline(v=extr_seq+1)
  txt.x <- extr_end/2
  txt.y <- max(res_extr$number_cells_2_snps)/2
  text(txt.x, txt.y,
      labels=paste("With", round(extr_seq,2), " more reads: \n", 
                  round(f(1+extr_seq, coef(fit1)[1], coef(fit1)[2], coef(fit1)[3]),0) ,"cells detected"))
 dev.off()

 try({png(paste0(outFile, "_snps.png"))
  plot(res_extr$per, res_extr$number_snps, xlab="Percentage*100", ylab="Number ofSNPs")
  points(res1$per, res1$number_snps, col="red", pch=19)
  abline(v=extr_seq+1)
  txt.x <- extr_end/2
  txt.y <- 100
  text(txt.x, txt.y,
       labels=paste("With", round(extr_seq,2), " more reads: \n", 
                  round(f(1+extr_seq, coef(fit0)[1], coef(fit0)[2], coef(fit0)[3]),0) ,"SNPs detected"))
 dev.off()})
	# outline the packages needed for this script
	# pak::pkg_install("hadley/requirements")
	# requirements::req_file("SNP_saturation.R")

	# Input 1-2 sentences that describe what this does, when it was done, and why

	if (!requireNamespace("parallel", quietly = TRUE)) {
	install.packages("parallel")
	}

	if (!requireNamespace("vcfR", quietly = TRUE)) {
	install.packages("vcfR")
	}

	if (!requireNamespace("dplyr", quietly = TRUE)) {
	install.packages("dplyr")
	}

	# State the libraries up front
	library(parallel)
	library(dplyr)
	library(cvfR)

	# Do not output scientific notation
	options(scipen = 999)

	#' Why are you running gc at the start?
	# Initial garbage collection
	gc()

	# Set command line arguments
	# (why? The output is character(0)? or is this not supposed to be run interactively?)
	args <- commandArgs(TRUE)

	bam <- args[1]
	#' tools::file_ext("wat.bam") I think does the same thing?
	sample_bam <- gsub(".bam", "", bam)
	extr_seq <- as.numeric(args[2])
	outFile <- args[3]

	#' Jenny Bryan will set your computer on fire!
	genome <-
	"/dd_userdata/usrdat02/userdata/sfreytag/indexes/hg19.premrna/fasta/genome.fa"

	# Prepare files
	# First subset to only chromosome X
	#system(paste0("samtools view -h ", bam, " X > ", sample_bam, "_chrX.bam"))
	#' why are you using `system` on paste?

	# Remove additional tags on barcodes
	barcodes <-
	read.table(paste0(sample_bam , "_barcodes.tsv"),
	header = F,
	sep = "\t")

	# find and replace -1 with 0?
	barcodes$V1 <- gsub("-1", "", barcodes$V1)

	# is "aa" a good variable name?
	aa <- length(barcodes$V1)
	write.table(
	barcodes,
	file = paste0(sample_bam, "_barcodes_corr.tsv"),
	sep = "\t",
	quote = F,
	col.names = F,
	row.names = F
	)

	# Make vcf of SNPs
	#system(paste0("source activate samtools
	# samtools mpileup -f ", genome , " -g ", sample_bam,
	# "_chrX.bam \| bcftools call -m > ", sample_bam, "_chrX.vcf
	# source deactivate"))

	# Subset to heterozygous SNPs
	#system(paste0('grep "0/1" ', sample_bam, '_chrX.vcf > ', sample_bam, '_chrX.het.vcf'))
	# Reattach header
	#system(paste0('grep "#" ', sample_bam, '_chrX.vcf \| cat - ', sample_bam, '_chrX.het.vcf > ', sample_bam, '_chrX.het.corr.vcf'))

	## Function for one run

	one_run <- function(i, per_i) {
	library(vcfR) # put library calls outside of the function
	library(dplyr) # ^^

	message(paste0("Iteration: ", i))

	# Prevent all parallel runs from having the same starting seed
	Sys.sleep(i)

	# Subsample the bam file at percentage and then index
	system(
	paste0(
	"source activate samtools
	samtools view -s ",
	per_i + i ,
	" -b ",
	sample_bam,
	"_chrX.bam > sub_sampled_tmp_",
	i,
	".bam
	source deactivate"
	)
	)
	system(paste0("samtools index sub_sampled_tmp_", i, ".bam"))

	# Apply CellSNP to subsampled file
	system(
	paste0(
	"source activate CellSNP
	cellSNP -s sub_sampled_tmp_",
	i,
	".bam -b ",
	sample_bam,
	"_barcodes_corr.tsv",
	" -o sub_sampled_tmp_",
	i,
	".snp -R ",
	sample_bam,
	"_chrX.het.corr.vcf -p 20
	source deactivate"
	)
	)

	# Count the number of SNPs and cells with SNPs
	vcf <- read.vcfR(paste0("sub_sampled_tmp_", i, ".snp.gz"),
	verbose = FALSE)

	number_snps <- dim(vcf@fix)[1]

	vcf_tibble <- vcfR2tidy(vcf,
	format_fields = c("DP", "AD", "OTH", "ALL"))

	vcf_tibble_narm <- vcf_tibble$gt %>%
	mutate(present = gt_DP >= 0) %>%
	group_by(Indiv) %>%
	summarize(n_alleles = sum(present, na.rm = TRUE))

	number_cells_1_snps <- dim(vcf_tibble_narm %>%
	ungroup() %>%
	filter(n_alleles >= 1))[1]

	number_cells_2_snps <- dim(vcf_tibble_narm %>%
	ungroup() %>%
	filter(n_alleles >= 2))[1]

	# Remove temporary files
	system(paste0("rm _tmp_", i, ""))

	return(
	list(
	number_snps = number_snps,
	number_cells_1_snps = number_cells_1_snps,
	number_cells_2_snps = number_cells_2_snps
	)
	)

	} # end of function
	#' This function should be in a separate script that can be "source"'d.

	## All percentages to be investigated
	per <- seq(0.05, 0.95, 0.05)

	## Initialize objects to save
	#' this is a dangerous pattern - should try and specify the size of the list
	res <- list()

	for (i in 1:length(per)) {
	per_tmp <- per[i]

	cl <- makeCluster(5)
	clusterExport(cl, c("per_tmp", "sample_bam", "one_run"))
	res[[i]] <- parLapply(cl, 1:5,
	function(x){
	one_run(x, per_tmp)
	}
	) # make your function clearer

	stopCluster(cl)

	}

	saveRDS(res, file = paste0("Results_", outFile, ".rds"))

	# Make a dataframe from results averaged over 5 runs
	res1 <-
	data.frame(
	number_snps = sapply(res,
	function(x) mean(sapply(x,
	function(y) y[[1]]))),
	number_cells_1_snp = sapply(res,
	function(x) mean(sapply(x,
	function(y) y[[2]]))),
	number_cells_2_snps = sapply(res,
	function(x) mean(sapply(x,
	function(y) y[[3]]))),
	per = per
	)

	# Make a dataframe from results
	res2 <-
	data.frame(
	number_snps = c(sapply(res, function(x) sapply(x,
	function(y) y[[1]]))),
	number_cells_1_snp = c(sapply(res,
	function(x) sapply(x,
	function(y) y[[2]]))),
	number_cells_2_snps = c(sapply(res,
	function(x) sapply(x,
	function(y) y[[3]]))),
	per = rep(per, each = 5)
	)

	# Polynomial function
	f <- function(x, a, b, n) {
	(a * x ^ n) / (x ^ n + b)
	}

	# Fit polynomial function for number of cells with at least one SNP
	inital_value_n <-
	lm(log(number_cells_1_snp + 1) ~ log(per),
	res2)$coefficients[2]

	fit <-
	nls(
	number_cells_1_snp ~ f(per, a, b, n),
	data = res2,
	start = list(a = 4634,
	b = 1,
	n = inital_value_n)
	)

	# Fit polynomial function for number of cells with at least two SNPs
	inital_value_n_1 <-
	lm(log(number_cells_1_snp + 1) ~ log(per),
	res2)$coefficients[2]

	fit1 <-
	nls(
	number_cells_2_snps ~ f(per, a, b, n),
	data = res2,
	start = list(a = 4634,
	b = 1,
	n = inital_value_n_1)
	)

	# Fit polynomial function for number of SNPs
	inital_value_n_0 <-
	lm(log(number_cells_1_snp + 1) ~ log(per), res2)$coefficients[2]

	#' woh - why are are you using try here?
	try(fit0 <-
	nls(
	number_snps ~ f(per, a, b, n),
	data = res2,
	start = list(a = res2[dim(res2)[1], 2], b = 1, n = inital_value_n_0)
	))

	# Find range for extrapolation
	extr_end <- 1 + extr_seq + 0.5
	per_extr <- seq(0.05, extr_end, 0.01)

	# Make dataset for extrapolation
	res_extr <- data.frame(
	per = per_extr,
	number_cells_1_snp = sapply(per_extr,
	function(x){ f(x,
	coef(fit)[1],
	coef(fit)[2],
	coef(fit)[3])
	}
	),
	number_cells_2_snps = sapply(per_extr,
	function(x){
	f(x,
	coef(fit1)[1],
	coef(fit1)[2],
	coef(fit1)[3])
	}
	)
	)

	try(res_extr$number_snps <-
	sapply(per_extr,
	function(x){
	f(x,
	coef(fit0)[1],
	coef(fit0)[2],
	coef(fit0)[3])
	}
	)
	)

	# Plot results and output them
	png(paste0(outFile, "_cells_1_snp.png"))

	plot(res_extr$per,
	res_extr$number_cells_1_snp,
	xlab = "Percentage*100",
	ylab = "Number of cells with 1 SNP")
	points(res1$per,
	res1$number_cells_1_snp,
	col = "red",
	pch = 19)
	abline(v = extr_seq + 1)
	txt.x <- extr_end / 2
	txt.y <- max(res_extr$number_cells_1_snp) / 2
	text(txt.x,
	txt.y,
	labels = paste(
	"With",
	round(extr_seq, 2),
	" more reads: \n",
	round(f(
	1 + extr_seq, coef(fit)[1], coef(fit)[2], coef(fit)[3]
	), 0) ,
	"cells detected"
	))
	dev.off()

	png(paste0(outFile, "_cells_2_snps.png"))

	plot(res_extr$per,
	res_extr$number_cells_2_snps,
	xlab = "Percentage*100",
	ylab = "Number of cells with 2 SNPs")
	points(res1$per,
	res1$number_cells_2_snps,
	col = "red",
	pch = 19)
	abline(v = extr_seq + 1)
	txt.x <- extr_end / 2
	txt.y <- max(res_extr$number_cells_2_snps) / 2
	text(txt.x,
	txt.y,
	labels = paste(
	"With",
	round(extr_seq, 2),
	" more reads: \n",
	round(f(
	1 + extr_seq, coef(fit1)[1], coef(fit1)[2], coef(fit1)[3]
	), 0) ,
	"cells detected"
	))
	dev.off()

	try({
	png(paste0(outFile, "_snps.png"))
	plot(res_extr$per,
	res_extr$number_snps,
	xlab = "Percentage*100",
	ylab = "Number ofSNPs")
	points(res1$per, res1$number_snps, col = "red", pch = 19)
	abline(v = extr_seq + 1)
	txt.x <- extr_end / 2
	txt.y <- 100
	text(txt.x,
	txt.y,
	labels = paste(
	"With",
	round(extr_seq, 2),
	" more reads: \n",
	round(f(
	1 + extr_seq, coef(fit0)[1], coef(fit0)[2], coef(fit0)[3]
	), 0) ,
	"SNPs detected"
	))
	dev.off()
	}
	)
	library(parallel)

	# Do not output scientific notation
	options(scipen=999)

	# Initial garbage collection
	gc()

	# Set command line arguments
	args <- commandArgs(TRUE)

	bam <- args[1]
	sample_bam <- gsub(".bam", "", bam)
	extr_seq <- as.numeric(args[2])
	outFile <- args[3]
	genome = "/dd_userdata/usrdat02/userdata/sfreytag/indexes/hg19.premrna/fasta/genome.fa"

	# Prepare files
	# First subset to only chromosome X
	#system(paste0("samtools view -h ", bam, " X > ", sample_bam, "_chrX.bam"))

	# Remove additional tags on barcodes
	barcodes <- read.table(paste0(sample_bam ,"_barcodes.tsv"), header=F, sep="\t")
	barcodes$V1 <- gsub("-1", "", barcodes$V1)
	aa <- length(barcodes$V1)
	write.table(barcodes, file=paste0(sample_bam, "_barcodes_corr.tsv"), sep="\t",
	quote=F, col.names=F, row.names=F)

	# Make vcf of SNPs
	#system(paste0("source activate samtools
	# samtools mpileup -f ", genome , " -g ", sample_bam,
	# "_chrX.bam \| bcftools call -m > ", sample_bam, "_chrX.vcf
	# source deactivate"))

	# Subset to heterozygous SNPs
	#system(paste0('grep "0/1" ', sample_bam, '_chrX.vcf > ', sample_bam, '_chrX.het.vcf'))
	# Reattach header
	#system(paste0('grep "#" ', sample_bam, '_chrX.vcf \| cat - ', sample_bam, '_chrX.het.vcf > ', sample_bam, '_chrX.het.corr.vcf'))

	## Function for one run

	one_run <- function(i, per_i) {

	library(vcfR)
	library(dplyr)

	print(paste0("Iteration: ", i))

	# Prevent all parallel runs from having the same starting seed
	Sys.sleep(i)

	# Subsample the bam file at percentage and then index
	system(paste0("source activate samtools
	samtools view -s ", per_i+i , " -b ", sample_bam, "_chrX.bam > sub_sampled_tmp_", i, ".bam
	source deactivate"))
	system(paste0("samtools index sub_sampled_tmp_", i, ".bam"))

	# Apply CellSNP to subsampled file
	system(paste0("source activate CellSNP
	cellSNP -s sub_sampled_tmp_", i, ".bam -b ",
	sample_bam, "_barcodes_corr.tsv",
	" -o sub_sampled_tmp_", i, ".snp -R ", sample_bam, "_chrX.het.corr.vcf -p 20
	source deactivate"))

	# Count the number of SNPs and cells with SNPs
	vcf <- read.vcfR(paste0("sub_sampled_tmp_", i, ".snp.gz"), verbose = FALSE)
	number_snps <- dim(vcf@fix) [1]
	vcf_tibble <- vcfR2tidy(vcf, format_fields = c("DP", "AD", "OTH", "ALL"))
	vcf_tibble_narm <- vcf_tibble$gt %>%
	mutate(present=gt_DP>=0) %>% group_by(Indiv) %>%
	summarize(n_alleles=sum(present, na.rm=TRUE))
	number_cells_1_snps <- dim(vcf_tibble_narm %>%
	ungroup() %>%
	filter(n_alleles>=1))[1]
	number_cells_2_snps <- dim(vcf_tibble_narm %>%
	ungroup() %>%
	filter(n_alleles>=2))[1]

	# Remove temporary files
	system(paste0("rm _tmp_", i, ""))

	return(list(number_snps=number_snps, number_cells_1_snps=number_cells_1_snps,
	number_cells_2_snps=number_cells_2_snps))

	}

	## All percentages to be investigated
	per <- seq(0.05,0.95,0.05)

	## Initialize objects to save
	res <- list()

	for (i in 1:length(per)){

	per_tmp <- per[i]

	cl <- makeCluster(5)
	clusterExport(cl, c("per_tmp", "sample_bam", "one_run"))
	res[[i]] <- parLapply(cl, 1:5, function(x) one_run(x, per_tmp))
	stopCluster(cl)

	}

	saveRDS(res, file=paste0("Results_", outFile, ".rds"))

	# Make a dataframe from results averaged over 5 runs
	res1 <- data.frame(number_snps = sapply(res, function(x) mean(sapply(x, function(y) y[[1]]))),
	number_cells_1_snp = sapply(res, function(x) mean(sapply(x, function(y) y[[2]]))),
	number_cells_2_snps = sapply(res, function(x) mean(sapply(x, function(y) y[[3]]))),
	per = per)

	# Make a dataframe from results
	res2 <- data.frame(number_snps = c(sapply(res, function(x) sapply(x, function(y) y[[1]]))),
	number_cells_1_snp = c(sapply(res, function(x) sapply(x, function(y) y[[2]]))),
	number_cells_2_snps = c(sapply(res, function(x) sapply(x, function(y) y[[3]]))),
	per = rep(per, each=5))

	# Polynomial function
	f <- function(x, a, b, n) {
	(a*x^n)/ (x^n+b)
	}

	# Fit polynomial function for number of cells with at least one SNP
	inital_value_n <- lm(log(number_cells_1_snp+1)~log(per), res2)$coefficients[2]
	fit <- nls(number_cells_1_snp~f(per,a,b,n), data=res2, start=list(a=4634, b=1, n=inital_value_n))

	# Fit polynomial function for number of cells with at least two SNPs
	inital_value_n_1 <- lm(log(number_cells_1_snp+1)~log(per), res2)$coefficients[2]
	fit1 <- nls(number_cells_2_snps~f(per,a,b,n), data=res2, start=list(a=4634, b=1, n=inital_value_n_1))

	# Fit polynomial function for number of SNPs
	inital_value_n_0 <- lm(log(number_cells_1_snp+1)~log(per), res2)$coefficients[2]
	try(fit0 <- nls(number_snps~f(per,a,b,n), data=res2, start=list(a=res2[dim(res2)[1],2], b=1, n=inital_value_n_0)))

	# Find range for extrapolation
	extr_end <- 1 + extr_seq + 0.5
	per_extr <- seq(0.05, extr_end, 0.01)

	# Make dataset for extrapolation
	res_extr <- data.frame(per=per_extr,
	number_cells_1_snp=sapply(per_extr, function(x) f(x, coef(fit)[1], coef(fit)[2], coef(fit)[3])),
	number_cells_2_snps=sapply(per_extr, function(x) f(x, coef(fit1)[1], coef(fit1)[2], coef(fit1)[3])))

	try(res_extr$number_snps <- sapply(per_extr, function(x) f(x, coef(fit0)[1], coef(fit0)[2], coef(fit0)[3])))

	# Plot results and output them
	png(paste0(outFile, "_cells_1_snp.png"))
	plot(res_extr$per, res_extr$number_cells_1_snp, xlab="Percentage*100", ylab="Number of cells with 1 SNP")
	points(res1$per, res1$number_cells_1_snp, col="red", pch=19)
	abline(v=extr_seq+1)
	txt.x <- extr_end/2
	txt.y <- max(res_extr$number_cells_1_snp)/2
	text(txt.x, txt.y,
	labels=paste("With", round(extr_seq,2), " more reads: \n",
	round(f(1+extr_seq, coef(fit)[1], coef(fit)[2], coef(fit)[3]),0) ,"cells detected"))
	dev.off()

	png(paste0(outFile, "_cells_2_snps.png"))
	plot(res_extr$per, res_extr$number_cells_2_snps, xlab="Percentage*100", ylab="Number of cells with 2 SNPs")
	points(res1$per, res1$number_cells_2_snps, col="red", pch=19)
	abline(v=extr_seq+1)
	txt.x <- extr_end/2
	txt.y <- max(res_extr$number_cells_2_snps)/2
	text(txt.x, txt.y,
	labels=paste("With", round(extr_seq,2), " more reads: \n",
	round(f(1+extr_seq, coef(fit1)[1], coef(fit1)[2], coef(fit1)[3]),0) ,"cells detected"))
	dev.off()

	try({png(paste0(outFile, "_snps.png"))
	plot(res_extr$per, res_extr$number_snps, xlab="Percentage*100", ylab="Number ofSNPs")
	points(res1$per, res1$number_snps, col="red", pch=19)
	abline(v=extr_seq+1)
	txt.x <- extr_end/2
	txt.y <- 100
	text(txt.x, txt.y,
	labels=paste("With", round(extr_seq,2), " more reads: \n",
	round(f(1+extr_seq, coef(fit0)[1], coef(fit0)[2], coef(fit0)[3]),0) ,"SNPs detected"))
	dev.off()})