thanhleviet · June 12, 2019 22:40
diff --git a/plot_depth.r b/plot_depth.r
 library(readr)
 library(dplyr)

 add.alpha <- function(col, alpha=1){
  if(missing(col))
    stop("Please provide a vector of colours.")
  apply(sapply(col, col2rgb)/255, 2,
        function(x)
          rgb(x[1], x[2], x[3], alpha=alpha))
 }

 bed_files <- list.files("depth_batch1/","*.bed", full.names = T)


 batch1 <- grep("depth//[0-9]{3}", bed_files, value = T)
 batch1_name <- gsub("depth_batch3//|_recal_zero_genomecov.bed", "",batch1)
 batch2 <- grep("depth_batch1//[0-9]{1,2}_", bed_files, value = T)
 batch2_name <- gsub("depth_batch1//|_recal_zero_genomecov.bed", "",batch2)
 # bed <- read_delim(bed_files[2], delim = "\t", col_names = c("seq", "pos", "depth"))

 beds1 <- lapply(batch1, function(x) read_delim(x, delim = "\t", col_names = c("seq", "pos", "depth")))

 beds2 <- lapply(batch2, function(x) read_delim(x, delim = "\t", col_names = c("seq", "pos", "depth")))

 # png(filename = "doc.png", width = 1200, height = 800, units = "px", type = "cairo", antialias = "subpixel")
 # seq_300 <- map_lgl(beds1, function(x) sum(x$depth==0) > 300)

 # new_beds <- beds[seq_300]

 # with(bed, plot(pos, depth, type = "l", ylim = c(-1, 7000), col = "grey90"))

 # for (i in seq_along(beds)[1:3]) {
 #   print(i)
 #   with(beds[[i]],
 #        polygon(c(pos[1], pos, pos[10690]),c(min(depth), depth, min(depth)),
 #                border = NA,
 #                col = "grey90"))
 # }
 beds <- beds2
 low_cov <- lapply(beds, function(x) x %>% filter(seq != "DENV2", depth < 10) %>% nrow() > 0) %>% unlist

 beds <- lapply(beds[low_cov], function(x) x %>% filter(depth < 10))
 y_max <- length(beds) + 10

 beds_avg <- bind_rows(beds2) %>%
  group_by(pos) %>%
  summarise(mean_depth = mean(depth),
            median_depth = median(depth),
            min_depth = min(depth),
            max_depth = max(depth))
 # png(filename = "batch1.png", width = 2830, height = 1390, type = "quartz", pointsize = 36)
 dev.off()
 # layout(mat = matrix(c(1,1,2,2), 2,2, byrow = TRUE), heights = c(4,2), FALSE)
 # layout.show(10)
 # par(mar = c(1,4,2,1) + 0.1, oma = c(0,0,0,0))
 par(mar = c(5,4,2,4))
 plot(0,0, ylim = c(0,y_max), xlim = c(1,11000), type = "n", ylab = "Sample", xlab = "Locus", axes=FALSE, frame.plot=FALSE, main = "Batch 2")

 # kolor <- distinctColorPalette(nrow(den1_ft))
 kolor <- c("grey80", "#DD6179", "#7FDBD2", "#7ADD99", "#7EADD7", "#A543E5", "#82E557", "#D2DCA2", "#D6DBDD", "#C7A095", "#DC61CB", "#D6A2D5", "#8278D5", "grey80")

 for (i in seq_len(nrow(den1_ft))){
  yy <- c(0, y_max, y_max, 0)
  start <- den1_ft[i,]$start
  end <- den1_ft[i,]$end
  gene <- den1_ft$product1
  mid <- den1_ft$mid
  xx <- c(start, start, end, end)
  polygon(xx, yy, col = add.alpha(kolor[i], 0.75), border = NA)
  text(mid, y_max, gene, srt = 90, adj = 1)
 }

 for (i in seq_along(beds)){
  abline(h = i, lty = 3, col = add.alpha("grey40", 0.2))
  points(x = beds[[i]]$pos, y = rep(i, nrow(beds[[i]])), col = add.alpha("black", 0.09), pch = 19, cex = 0.5)
 }

 axis(side=2, at = seq(1,95,10), las = 2, lwd = 2)
 axis(side=1, at = seq(0,11000,1000), pos = -1, lwd = 2)

 # par(mar = c(4.1,4.1,0,1.1))
 par(new = T)
 with(beds_avg, plot(x = pos, y = median_depth, type = "l", axes = F, frame.plot=FALSE , xlab = NA, ylab = NA, col = add.alpha("red", 0.8)))
 max_cov <- max(beds_avg$median_depth) + 100
 axis(side=4, at = seq(0,max_cov,500), pos = 10800, las = 2, col = add.alpha("red", 0.8), col.axis = add.alpha("red", 0.8), lwd = 2)
 mtext(side = 4, line = 2, "Median of Coverage", col = add.alpha("red", 0.8))

 ################################################################
 beds1a <- lapply(beds2, function (x)
    x %>% mutate(pos = as.numeric(pos), product = case_when(
    (pos >= 73 & pos <= 413) ~ "anchored capsid protein",
    (pos >= 415 & pos <= 906) ~ "membrane precursor",
    (pos >= 907 & pos <= 2397) ~  "E gene",
    (pos >= 2398 & pos <= 3453) ~  "NS1",
    (pos >= 3454 & pos <= 4107) ~  "NS2A",
    (pos >= 4108 & pos <= 4497) ~  "NS2B",
    (pos >= 4498 & pos <= 6354) ~  "NS3",
    (pos >= 6355 & pos <= 6735) ~  "NS4A",
    (pos >= 6736 & pos <= 6804) ~  "2K",
    (pos >= 6805 & pos <= 7551) ~  "NS4B",
    (pos >= 7552 & pos <= 10248) ~  "NS5",
    TRUE ~ "non-CDS"
  )) %>%
    filter(depth < 10) %>%
    group_by(product) %>%
    summarise(counts = n()) %>%
    spread(key = product, value = counts, fill = "0")
  ) %>%
  bind_rows() %>%
  mutate(sample = batch2_name) %>%
  select(sample, everything())

 write_csv(beds1a, "batch2_less_than_10x.csv")


 ###Average coverage

 avg_cov <- lapply(beds2, function(x) sum(x$depth/nrow(x))) %>% unlist()
 avg_cov_df <- data.frame(sample = batch2_name, avg_cov = avg_cov)
 write_csv(avg_cov_df, "batch_2_avg_cov.csv")
	library(readr)
	library(dplyr)

	add.alpha <- function(col, alpha=1){
	if(missing(col))
	stop("Please provide a vector of colours.")
	apply(sapply(col, col2rgb)/255, 2,
	function(x)
	rgb(x[1], x[2], x[3], alpha=alpha))
	}

	bed_files <- list.files("depth_batch1/","*.bed", full.names = T)


	batch1 <- grep("depth//[0-9]{3}", bed_files, value = T)
	batch1_name <- gsub("depth_batch3//\|_recal_zero_genomecov.bed", "",batch1)
	batch2 <- grep("depth_batch1//[0-9]{1,2}_", bed_files, value = T)
	batch2_name <- gsub("depth_batch1//\|_recal_zero_genomecov.bed", "",batch2)
	# bed <- read_delim(bed_files[2], delim = "\t", col_names = c("seq", "pos", "depth"))

	beds1 <- lapply(batch1, function(x) read_delim(x, delim = "\t", col_names = c("seq", "pos", "depth")))

	beds2 <- lapply(batch2, function(x) read_delim(x, delim = "\t", col_names = c("seq", "pos", "depth")))

	# png(filename = "doc.png", width = 1200, height = 800, units = "px", type = "cairo", antialias = "subpixel")
	# seq_300 <- map_lgl(beds1, function(x) sum(x$depth==0) > 300)

	# new_beds <- beds[seq_300]

	# with(bed, plot(pos, depth, type = "l", ylim = c(-1, 7000), col = "grey90"))

	# for (i in seq_along(beds)[1:3]) {
	# print(i)
	# with(beds[[i]],
	# polygon(c(pos[1], pos, pos[10690]),c(min(depth), depth, min(depth)),
	# border = NA,
	# col = "grey90"))
	# }
	beds <- beds2
	low_cov <- lapply(beds, function(x) x %>% filter(seq != "DENV2", depth < 10) %>% nrow() > 0) %>% unlist

	beds <- lapply(beds[low_cov], function(x) x %>% filter(depth < 10))
	y_max <- length(beds) + 10

	beds_avg <- bind_rows(beds2) %>%
	group_by(pos) %>%
	summarise(mean_depth = mean(depth),
	median_depth = median(depth),
	min_depth = min(depth),
	max_depth = max(depth))
	# png(filename = "batch1.png", width = 2830, height = 1390, type = "quartz", pointsize = 36)
	dev.off()
	# layout(mat = matrix(c(1,1,2,2), 2,2, byrow = TRUE), heights = c(4,2), FALSE)
	# layout.show(10)
	# par(mar = c(1,4,2,1) + 0.1, oma = c(0,0,0,0))
	par(mar = c(5,4,2,4))
	plot(0,0, ylim = c(0,y_max), xlim = c(1,11000), type = "n", ylab = "Sample", xlab = "Locus", axes=FALSE, frame.plot=FALSE, main = "Batch 2")

	# kolor <- distinctColorPalette(nrow(den1_ft))
	kolor <- c("grey80", "#DD6179", "#7FDBD2", "#7ADD99", "#7EADD7", "#A543E5", "#82E557", "#D2DCA2", "#D6DBDD", "#C7A095", "#DC61CB", "#D6A2D5", "#8278D5", "grey80")

	for (i in seq_len(nrow(den1_ft))){
	yy <- c(0, y_max, y_max, 0)
	start <- den1_ft[i,]$start
	end <- den1_ft[i,]$end
	gene <- den1_ft$product1
	mid <- den1_ft$mid
	xx <- c(start, start, end, end)
	polygon(xx, yy, col = add.alpha(kolor[i], 0.75), border = NA)
	text(mid, y_max, gene, srt = 90, adj = 1)
	}

	for (i in seq_along(beds)){
	abline(h = i, lty = 3, col = add.alpha("grey40", 0.2))
	points(x = beds[[i]]$pos, y = rep(i, nrow(beds[[i]])), col = add.alpha("black", 0.09), pch = 19, cex = 0.5)
	}

	axis(side=2, at = seq(1,95,10), las = 2, lwd = 2)
	axis(side=1, at = seq(0,11000,1000), pos = -1, lwd = 2)

	# par(mar = c(4.1,4.1,0,1.1))
	par(new = T)
	with(beds_avg, plot(x = pos, y = median_depth, type = "l", axes = F, frame.plot=FALSE , xlab = NA, ylab = NA, col = add.alpha("red", 0.8)))
	max_cov <- max(beds_avg$median_depth) + 100
	axis(side=4, at = seq(0,max_cov,500), pos = 10800, las = 2, col = add.alpha("red", 0.8), col.axis = add.alpha("red", 0.8), lwd = 2)
	mtext(side = 4, line = 2, "Median of Coverage", col = add.alpha("red", 0.8))

	################################################################
	beds1a <- lapply(beds2, function (x)
	x %>% mutate(pos = as.numeric(pos), product = case_when(
	(pos >= 73 & pos <= 413) ~ "anchored capsid protein",
	(pos >= 415 & pos <= 906) ~ "membrane precursor",
	(pos >= 907 & pos <= 2397) ~ "E gene",
	(pos >= 2398 & pos <= 3453) ~ "NS1",
	(pos >= 3454 & pos <= 4107) ~ "NS2A",
	(pos >= 4108 & pos <= 4497) ~ "NS2B",
	(pos >= 4498 & pos <= 6354) ~ "NS3",
	(pos >= 6355 & pos <= 6735) ~ "NS4A",
	(pos >= 6736 & pos <= 6804) ~ "2K",
	(pos >= 6805 & pos <= 7551) ~ "NS4B",
	(pos >= 7552 & pos <= 10248) ~ "NS5",
	TRUE ~ "non-CDS"
	)) %>%
	filter(depth < 10) %>%
	group_by(product) %>%
	summarise(counts = n()) %>%
	spread(key = product, value = counts, fill = "0")
	) %>%
	bind_rows() %>%
	mutate(sample = batch2_name) %>%
	select(sample, everything())

	write_csv(beds1a, "batch2_less_than_10x.csv")


	###Average coverage

	avg_cov <- lapply(beds2, function(x) sum(x$depth/nrow(x))) %>% unlist()
	avg_cov_df <- data.frame(sample = batch2_name, avg_cov = avg_cov)
	write_csv(avg_cov_df, "batch_2_avg_cov.csv")