hbhutta · December 23, 2024 18:22
diff --git a/rnaseq_utils.R b/rnaseq_utils.R
 library(limma)
 library(edgeR)
 library(sva)
 library(ggplot2)
 library(tidyr)
 library(broom)
 library(dplyr)
 library(ggpubr)
 library(biomaRt)
 options(stringsAsFactors = F)

 sig_info <- function(dges) {
  print(sprintf(
    "#. of DGEs that have adjusted p-value < 0.05: %s",
    sum(dges$adj.P.Val < 0.05)
  ))
  print(sprintf(
    "#. of DGEs that have adjusted p-value < 0.05 AND abs(logFC) >= 1: %s",
    sum(dges$adj.P.Val < 0.05 & abs(dges$logFC) >= 1)
  ))
 }
 annotate <- function(dges, annotation) {
  annotation <- rename(annotation, ens_gene_id = gene_id)
  dges <- dges %>% inner_join(annotation, by = join_by(ens_gene_id))
  return(dges)
 }
 plot_PCA <- function(pca, lcpm, title, path, group_or_batch) {
    ggsave(
    path,
    ggbiplot::ggbiplot(
      pca,
      obs.scale = 1,
      var.scale = 1,
      labels = colnames(lcpm),
      ellipse = TRUE,
      ellipse.linewidth = 1,
      ellipse.fill = TRUE,
      ellipse.alpha = 0.2,
      varname.size = TRUE,
      pc.biplot = FALSE,
      var.axes = FALSE,
      circle = FALSE,
      group = group_or_batch 
    ) +
      theme_bw() +
      theme(legend.position = 'bottom') +
      theme(plot.title = element_text(hjust = 0.5)) + # Center title
      scale_color_manual(values = c("royalblue2", "red3")) + # ellipse border colors
      scale_fill_manual(values = c("cyan", "red1")) + # ellipse fill color
      ggtitle(title) +
      labs(
        
        x = paste0("PC1 (", round(summary(pca)$importance[2, 1] * 100, 2), "%)"),
        y = paste0("PC2 (", round(summary(pca)$importance[2, 2] * 100, 2), "%)"),
        color = "Treatment",
      ) +
      theme(
        axis.title = element_text(face = "bold", size = "14"),
        axis.text = element_text(face = "bold", size = "14"),
        legend.text = element_text(face = "bold", size = "11"),
        legend.title = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()
      ) +
      guides(color = guide_legend(nrow = 1, byrow = TRUE)),
    height = 8,
    width = 8
  )
 }

 get_filtered_lcpm <- function(samples, group) {
  if (length(unique(group)) < 2) {
    dge <- DGEList(counts = samples)
    keep <- filterByExpr(dge)
    dge <- dge[keep, , keep.lib.sizes = FALSE] # First filtering step
    dge <- calcNormFactors(dge)
    v <- voom(dge, plot = T)
    lcpm <- v$E
    return(lcpm)
  }
  # 2-step filtering
  dge <- DGEList(counts = samples, group = group)
  design <- model.matrix( ~ group)
  keep <- filterByExpr(dge, design = design)
  dge <- dge[keep, , keep.lib.sizes = FALSE] # First filtering step
  dge <- calcNormFactors(dge)
  
  v <- voom(dge, plot = T)
  lcpm <- v$E
  
  group_labels <- unique(group)
  print(group_labels)
  n_1 <- length(group[group == group_labels[1]])
  n_2 <- length(group[group == group_labels[2]])
  size_of_smallest_group <- ifelse(n_1 < n_2, n_1, n_2)
  
  keep <- rownames(lcpm)[!(rowSums(lcpm < 1) > size_of_smallest_group)] # Second filtering step
  lcpm <- lcpm[keep, ]
  print(nrow(lcpm))
  return(lcpm)
 }
 get_dvol <- function(dges) {
   dvol <- dges %>%
    mutate(direction = ifelse(!(adj.P.Val < 0.05), "nonsig", ifelse(
      logFC >= 1, "up", ifelse(logFC <= -1, "dn", "lowdiff")
    )))
   
   print("Table summary of DGEs direction:")
   print(table(dvol$direction))
   
   return(dvol)
 }
 perform_DGE_limma <- function(lcpm, group, annotation) {
  design <- model.matrix(~group)
  fit <- lmFit(lcpm, design = design)
  toptab <- topTable(
    fit = eBayes(fit),
    coef = ncol(design),
    number = Inf,
  ) 
  
  toptab <- toptab %>%
    mutate(ens_gene_id = rownames(toptab)) %>%
    dplyr::select(-t, -B, -AveExpr, P.Val = P.Value) #%>% # we do not need the information from these columns
  
  toptab <- annotate(toptab, annotation)
  
   print(sprintf(
    "Found %d statistically significant DGEs using limma::lmFit",
    nrow(toptab)
    ))
  sig_info(toptab)
  
  return(toptab)
 }

 perform_DGE_wilcoxon <- function(lcpm, group, annotation) {
  gexp_by_gene_id_and_sample <- as.data.frame(t(lcpm)) %>%
    mutate(sample = row.names(.)) %>%
    gather(ens_gene_id, gene_exp, -sample) # Exclude sample from gathering
  
  metadata <- data.frame(sample = colnames(lcpm), from_group = group)
  
  # Annotate using metadata
  gexp_with_group_info <- inner_join(gexp_by_gene_id_and_sample, metadata, by = join_by(sample))

  summ_stats <- gexp_with_group_info %>%
    group_by(ens_gene_id, from_group) %>%
    summarise(mean_gene_exp = mean(gene_exp), .groups = 'drop') %>%
    as.data.frame() %>%
    pivot_wider(names_from = from_group, values_from = mean_gene_exp)
  
  print(
    sprintf(
      "Treating group label %s as disease/reference group in logFC calculation",
      colnames(summ_stats)[3]
    )
  )
  print(
    sprintf(
      "Calculating logFC as mean_expr(%s) - mean_expr(%s) for each gene...",
      colnames(summ_stats)[3],
      colnames(summ_stats)[2]
    )
  )
 #  if (half) {
 #    print("Halving logFC")
 #    summ_stats <- summ_stats %>% mutate(logFC = (.[[3]] - .[[2]]) / 2)
 #  } else {
 #    summ_stats <- summ_stats %>% mutate(logFC = .[[3]] - .[[2]])
 #  }
  summ_stats <- summ_stats %>% mutate(logFC = .[[3]] - .[[2]])
  print("Calculated logFC") 

  wilcox <- gexp_with_group_info %>%
    group_by(ens_gene_id) %>%
    do(tidy(wilcox.test(gene_exp ~ from_group, data = .))) %>%
    as.data.frame() %>%
    rename(P.Val = p.value) %>%
    mutate(adj.P.Val = p.adjust(P.Val, method = "BH")) %>% # benjamini-hochberg MH correction
    inner_join(summ_stats, by = join_by(ens_gene_id)) 
  
  dges <- wilcox #%>%
    #dplyr::select(ens_gene_id, P.Val, adj.P.Val, logFC)
  
  if (!(is.null(annotation))) {
   dges <- annotate(dges, annotation)     
  }

  print(sprintf("Found %d statistically significant DGEs using Wilcoxon", nrow(dges)))
  
  sig_info(dges)
  return(dges)
 }

 plot_dvol <- function(dges, path, title) {
  dvol <- get_dvol(dges)
  print(colnames(dvol))
  pcut <- filter(dges, adj.P.Val < 0.05) %>% with(P.Val) %>% max()
  labup <- filter(dvol, direction == 'up') #%>% slice(1:10)
  labdn <- filter(dvol, direction == 'dn') #%>% slice(1:10)
  
  # unique(direction) should give up dn lowdiff nonsig (not necessarily in this order)
  # up: top right box
  # dn: top left box
  # lowdiff (above pcut but low log FC) top middle box
  # nonsig (below pcut, any logFC) all three bottom boxes
  
  dvol_plot <- ggplot(dvol, aes(x = logFC, y = -log10(P.Val))) +
    geom_point(aes(color = direction, fill = direction)) +
    # !!! color maps are based on unique values in X column of df where X is mentioned in color = X
    ggrepel::geom_text_repel(
      aes(label = gene_name),
      data = labup,
      fontface = 'italic',
      max.overlaps = 16 # 12
    ) +
    ggtitle(title) +
    ggrepel::geom_text_repel(aes(label = gene_name), data = labdn, fontface = 'italic') +
    scale_color_manual(values = c(
      "up" = 'red',
      "dn" = 'blue',
      "lowdiff" = 'grey',
      "nonsig" = 'grey'
    )) +
    geom_vline(xintercept = 1, linetype = 'dotted') + # diff log cpm > 1 = upregulated
    geom_vline(xintercept = -1, linetype = 'dotted') + # diff log cpm < 1 = downregulated
    geom_hline(yintercept = -log10(pcut), linetype = 'dotted') +
    ylab(as.expression(bquote(paste(-log[10], 'P')))) +
    theme_classic(base_size = 15) +
    theme(legend.position = 'none') +
    theme(plot.title = element_text(hjust = 0.5)) +
    theme(
      axis.title = element_text(face = "bold", size = "14"),
      axis.text = element_text(face = "bold", size = "14"),
      legend.text = element_text(face = "bold", size = "11"),
      legend.title = element_blank(),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      plot.title = element_text(face = "bold", size = 10)
    )
  if (!(path == "")) {
    print(sprintf("Saving plot to path: %s", path))
    ggplot2::ggsave(path, dvol_plot, width = 8, height = 8)
  }
 }

 get_ora_ids <- function(dges, path) {
 # Assuming dges are significant (adj.P.Val < 0.05 and abs(logFC) > 1) 
  ora <- data.frame(ens_gene_id = dges$ens_gene_id)
  if (!(path == "")) {
    write.table(
      ora$ens_gene_id,
      path,
      sep = "\t",
      row.names = FALSE,
      col.names = FALSE,
      quote = FALSE
    ) 
  }
 }

 get_gsea_rnk <- function(dges, path) {
  # Assuming dges are significant (adj.P.Val < 0.05 and abs(logFC) > 1) 
  gsea <- data.frame(
    ens_gene_id = dges$ens_gene_id,
    rankings = sign(dges$logFC) * (-log10(dges$P.Val))
  )
  write.table(
    gsea,
    path,
    sep = "\t",
    row.names = FALSE,
    col.names = FALSE,
    quote = FALSE
  ) 
 }

 get_sig_dges <- function(dges) {
  sig <- dges %>% filter(abs(logFC) >= 1) %>% filter(adj.P.Val < 0.05)
  return(sig)
 }

 get_pig_genes <- function() {
  mart <- useMart("ENSEMBL_MART_ENSEMBL", host = "https://may2024.archive.ensembl.org/")
  pigMart <- useMart("ENSEMBL_MART_ENSEMBL",
                     dataset = "sscrofa_gene_ensembl",
                     host =
                       "https://may2024.archive.ensembl.org/")
  
  atts <- listAttributes(pigMart) # list all attributes
  piggenes <- getBM(
    c(
      'ensembl_gene_id',
      'external_gene_name',
      'hsapiens_homolog_ensembl_gene',
      'hsapiens_homolog_associated_gene_name'
    ),
    mart = pigMart
  ) %>% dplyr::select(
    ens_gene_id = ensembl_gene_id,
    human_ensembl_gene_id = hsapiens_homolog_ensembl_gene,
    pig_gene_name = external_gene_name,
    human_gene_name = hsapiens_homolog_associated_gene_name
  ) %>%
    filter(!(human_ensembl_gene_id == ""))
 }
	library(limma)
	library(edgeR)
	library(sva)
	library(ggplot2)
	library(tidyr)
	library(broom)
	library(dplyr)
	library(ggpubr)
	library(biomaRt)
	options(stringsAsFactors = F)

	sig_info <- function(dges) {
	print(sprintf(
	"#. of DGEs that have adjusted p-value < 0.05: %s",
	sum(dges$adj.P.Val < 0.05)
	))
	print(sprintf(
	"#. of DGEs that have adjusted p-value < 0.05 AND abs(logFC) >= 1: %s",
	sum(dges$adj.P.Val < 0.05 & abs(dges$logFC) >= 1)
	))
	}
	annotate <- function(dges, annotation) {
	annotation <- rename(annotation, ens_gene_id = gene_id)
	dges <- dges %>% inner_join(annotation, by = join_by(ens_gene_id))
	return(dges)
	}
	plot_PCA <- function(pca, lcpm, title, path, group_or_batch) {
	ggsave(
	path,
	ggbiplot::ggbiplot(
	pca,
	obs.scale = 1,
	var.scale = 1,
	labels = colnames(lcpm),
	ellipse = TRUE,
	ellipse.linewidth = 1,
	ellipse.fill = TRUE,
	ellipse.alpha = 0.2,
	varname.size = TRUE,
	pc.biplot = FALSE,
	var.axes = FALSE,
	circle = FALSE,
	group = group_or_batch
	) +
	theme_bw() +
	theme(legend.position = 'bottom') +
	theme(plot.title = element_text(hjust = 0.5)) + # Center title
	scale_color_manual(values = c("royalblue2", "red3")) + # ellipse border colors
	scale_fill_manual(values = c("cyan", "red1")) + # ellipse fill color
	ggtitle(title) +
	labs(

	x = paste0("PC1 (", round(summary(pca)$importance[2, 1] * 100, 2), "%)"),
	y = paste0("PC2 (", round(summary(pca)$importance[2, 2] * 100, 2), "%)"),
	color = "Treatment",
	) +
	theme(
	axis.title = element_text(face = "bold", size = "14"),
	axis.text = element_text(face = "bold", size = "14"),
	legend.text = element_text(face = "bold", size = "11"),
	legend.title = element_blank(),
	panel.grid.major = element_blank(),
	panel.grid.minor = element_blank()
	) +
	guides(color = guide_legend(nrow = 1, byrow = TRUE)),
	height = 8,
	width = 8
	)
	}

	get_filtered_lcpm <- function(samples, group) {
	if (length(unique(group)) < 2) {
	dge <- DGEList(counts = samples)
	keep <- filterByExpr(dge)
	dge <- dge[keep, , keep.lib.sizes = FALSE] # First filtering step
	dge <- calcNormFactors(dge)
	v <- voom(dge, plot = T)
	lcpm <- v$E
	return(lcpm)
	}
	# 2-step filtering
	dge <- DGEList(counts = samples, group = group)
	design <- model.matrix( ~ group)
	keep <- filterByExpr(dge, design = design)
	dge <- dge[keep, , keep.lib.sizes = FALSE] # First filtering step
	dge <- calcNormFactors(dge)

	v <- voom(dge, plot = T)
	lcpm <- v$E

	group_labels <- unique(group)
	print(group_labels)
	n_1 <- length(group[group == group_labels[1]])
	n_2 <- length(group[group == group_labels[2]])
	size_of_smallest_group <- ifelse(n_1 < n_2, n_1, n_2)

	keep <- rownames(lcpm)[!(rowSums(lcpm < 1) > size_of_smallest_group)] # Second filtering step
	lcpm <- lcpm[keep, ]
	print(nrow(lcpm))
	return(lcpm)
	}
	get_dvol <- function(dges) {
	dvol <- dges %>%
	mutate(direction = ifelse(!(adj.P.Val < 0.05), "nonsig", ifelse(
	logFC >= 1, "up", ifelse(logFC <= -1, "dn", "lowdiff")
	)))

	print("Table summary of DGEs direction:")
	print(table(dvol$direction))

	return(dvol)
	}
	perform_DGE_limma <- function(lcpm, group, annotation) {
	design <- model.matrix(~group)
	fit <- lmFit(lcpm, design = design)
	toptab <- topTable(
	fit = eBayes(fit),
	coef = ncol(design),
	number = Inf,
	)

	toptab <- toptab %>%
	mutate(ens_gene_id = rownames(toptab)) %>%
	dplyr::select(-t, -B, -AveExpr, P.Val = P.Value) #%>% # we do not need the information from these columns

	toptab <- annotate(toptab, annotation)

	print(sprintf(
	"Found %d statistically significant DGEs using limma::lmFit",
	nrow(toptab)
	))
	sig_info(toptab)

	return(toptab)
	}

	perform_DGE_wilcoxon <- function(lcpm, group, annotation) {
	gexp_by_gene_id_and_sample <- as.data.frame(t(lcpm)) %>%
	mutate(sample = row.names(.)) %>%
	gather(ens_gene_id, gene_exp, -sample) # Exclude sample from gathering

	metadata <- data.frame(sample = colnames(lcpm), from_group = group)

	# Annotate using metadata
	gexp_with_group_info <- inner_join(gexp_by_gene_id_and_sample, metadata, by = join_by(sample))

	summ_stats <- gexp_with_group_info %>%
	group_by(ens_gene_id, from_group) %>%
	summarise(mean_gene_exp = mean(gene_exp), .groups = 'drop') %>%
	as.data.frame() %>%
	pivot_wider(names_from = from_group, values_from = mean_gene_exp)

	print(
	sprintf(
	"Treating group label %s as disease/reference group in logFC calculation",
	colnames(summ_stats)[3]
	)
	)
	print(
	sprintf(
	"Calculating logFC as mean_expr(%s) - mean_expr(%s) for each gene...",
	colnames(summ_stats)[3],
	colnames(summ_stats)[2]
	)
	)
	# if (half) {
	# print("Halving logFC")
	# summ_stats <- summ_stats %>% mutate(logFC = (.[[3]] - .[[2]]) / 2)
	# } else {
	# summ_stats <- summ_stats %>% mutate(logFC = .[[3]] - .[[2]])
	# }
	summ_stats <- summ_stats %>% mutate(logFC = .[[3]] - .[[2]])
	print("Calculated logFC")

	wilcox <- gexp_with_group_info %>%
	group_by(ens_gene_id) %>%
	do(tidy(wilcox.test(gene_exp ~ from_group, data = .))) %>%
	as.data.frame() %>%
	rename(P.Val = p.value) %>%
	mutate(adj.P.Val = p.adjust(P.Val, method = "BH")) %>% # benjamini-hochberg MH correction
	inner_join(summ_stats, by = join_by(ens_gene_id))

	dges <- wilcox #%>%
	#dplyr::select(ens_gene_id, P.Val, adj.P.Val, logFC)

	if (!(is.null(annotation))) {
	dges <- annotate(dges, annotation)
	}

	print(sprintf("Found %d statistically significant DGEs using Wilcoxon", nrow(dges)))

	sig_info(dges)
	return(dges)
	}

	plot_dvol <- function(dges, path, title) {
	dvol <- get_dvol(dges)
	print(colnames(dvol))
	pcut <- filter(dges, adj.P.Val < 0.05) %>% with(P.Val) %>% max()
	labup <- filter(dvol, direction == 'up') #%>% slice(1:10)
	labdn <- filter(dvol, direction == 'dn') #%>% slice(1:10)

	# unique(direction) should give up dn lowdiff nonsig (not necessarily in this order)
	# up: top right box
	# dn: top left box
	# lowdiff (above pcut but low log FC) top middle box
	# nonsig (below pcut, any logFC) all three bottom boxes

	dvol_plot <- ggplot(dvol, aes(x = logFC, y = -log10(P.Val))) +
	geom_point(aes(color = direction, fill = direction)) +
	# !!! color maps are based on unique values in X column of df where X is mentioned in color = X
	ggrepel::geom_text_repel(
	aes(label = gene_name),
	data = labup,
	fontface = 'italic',
	max.overlaps = 16 # 12
	) +
	ggtitle(title) +
	ggrepel::geom_text_repel(aes(label = gene_name), data = labdn, fontface = 'italic') +
	scale_color_manual(values = c(
	"up" = 'red',
	"dn" = 'blue',
	"lowdiff" = 'grey',
	"nonsig" = 'grey'
	)) +
	geom_vline(xintercept = 1, linetype = 'dotted') + # diff log cpm > 1 = upregulated
	geom_vline(xintercept = -1, linetype = 'dotted') + # diff log cpm < 1 = downregulated
	geom_hline(yintercept = -log10(pcut), linetype = 'dotted') +
	ylab(as.expression(bquote(paste(-log[10], 'P')))) +
	theme_classic(base_size = 15) +
	theme(legend.position = 'none') +
	theme(plot.title = element_text(hjust = 0.5)) +
	theme(
	axis.title = element_text(face = "bold", size = "14"),
	axis.text = element_text(face = "bold", size = "14"),
	legend.text = element_text(face = "bold", size = "11"),
	legend.title = element_blank(),
	panel.grid.major = element_blank(),
	panel.grid.minor = element_blank(),
	plot.title = element_text(face = "bold", size = 10)
	)
	if (!(path == "")) {
	print(sprintf("Saving plot to path: %s", path))
	ggplot2::ggsave(path, dvol_plot, width = 8, height = 8)
	}
	}

	get_ora_ids <- function(dges, path) {
	# Assuming dges are significant (adj.P.Val < 0.05 and abs(logFC) > 1)
	ora <- data.frame(ens_gene_id = dges$ens_gene_id)
	if (!(path == "")) {
	write.table(
	ora$ens_gene_id,
	path,
	sep = "\t",
	row.names = FALSE,
	col.names = FALSE,
	quote = FALSE
	)
	}
	}

	get_gsea_rnk <- function(dges, path) {
	# Assuming dges are significant (adj.P.Val < 0.05 and abs(logFC) > 1)
	gsea <- data.frame(
	ens_gene_id = dges$ens_gene_id,
	rankings = sign(dges$logFC) * (-log10(dges$P.Val))
	)
	write.table(
	gsea,
	path,
	sep = "\t",
	row.names = FALSE,
	col.names = FALSE,
	quote = FALSE
	)
	}

	get_sig_dges <- function(dges) {
	sig <- dges %>% filter(abs(logFC) >= 1) %>% filter(adj.P.Val < 0.05)
	return(sig)
	}

	get_pig_genes <- function() {
	mart <- useMart("ENSEMBL_MART_ENSEMBL", host = "https://may2024.archive.ensembl.org/")
	pigMart <- useMart("ENSEMBL_MART_ENSEMBL",
	dataset = "sscrofa_gene_ensembl",
	host =
	"https://may2024.archive.ensembl.org/")

	atts <- listAttributes(pigMart) # list all attributes
	piggenes <- getBM(
	c(
	'ensembl_gene_id',
	'external_gene_name',
	'hsapiens_homolog_ensembl_gene',
	'hsapiens_homolog_associated_gene_name'
	),
	mart = pigMart
	) %>% dplyr::select(
	ens_gene_id = ensembl_gene_id,
	human_ensembl_gene_id = hsapiens_homolog_ensembl_gene,
	pig_gene_name = external_gene_name,
	human_gene_name = hsapiens_homolog_associated_gene_name
	) %>%
	filter(!(human_ensembl_gene_id == ""))
	}