Last active
December 23, 2024 18:22
-
-
Save hbhutta/acc9606e526bc100c721af0646d9940a to your computer and use it in GitHub Desktop.
A suite of R functions for repetitive bulk RNAseq procedures which I wrote while working as a data analyst at a hospital.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(limma) | |
library(edgeR) | |
library(sva) | |
library(ggplot2) | |
library(tidyr) | |
library(broom) | |
library(dplyr) | |
library(ggpubr) | |
library(biomaRt) | |
options(stringsAsFactors = F) | |
sig_info <- function(dges) { | |
print(sprintf( | |
"#. of DGEs that have adjusted p-value < 0.05: %s", | |
sum(dges$adj.P.Val < 0.05) | |
)) | |
print(sprintf( | |
"#. of DGEs that have adjusted p-value < 0.05 AND abs(logFC) >= 1: %s", | |
sum(dges$adj.P.Val < 0.05 & abs(dges$logFC) >= 1) | |
)) | |
} | |
annotate <- function(dges, annotation) { | |
annotation <- rename(annotation, ens_gene_id = gene_id) | |
dges <- dges %>% inner_join(annotation, by = join_by(ens_gene_id)) | |
return(dges) | |
} | |
plot_PCA <- function(pca, lcpm, title, path, group_or_batch) { | |
ggsave( | |
path, | |
ggbiplot::ggbiplot( | |
pca, | |
obs.scale = 1, | |
var.scale = 1, | |
labels = colnames(lcpm), | |
ellipse = TRUE, | |
ellipse.linewidth = 1, | |
ellipse.fill = TRUE, | |
ellipse.alpha = 0.2, | |
varname.size = TRUE, | |
pc.biplot = FALSE, | |
var.axes = FALSE, | |
circle = FALSE, | |
group = group_or_batch | |
) + | |
theme_bw() + | |
theme(legend.position = 'bottom') + | |
theme(plot.title = element_text(hjust = 0.5)) + # Center title | |
scale_color_manual(values = c("royalblue2", "red3")) + # ellipse border colors | |
scale_fill_manual(values = c("cyan", "red1")) + # ellipse fill color | |
ggtitle(title) + | |
labs( | |
x = paste0("PC1 (", round(summary(pca)$importance[2, 1] * 100, 2), "%)"), | |
y = paste0("PC2 (", round(summary(pca)$importance[2, 2] * 100, 2), "%)"), | |
color = "Treatment", | |
) + | |
theme( | |
axis.title = element_text(face = "bold", size = "14"), | |
axis.text = element_text(face = "bold", size = "14"), | |
legend.text = element_text(face = "bold", size = "11"), | |
legend.title = element_blank(), | |
panel.grid.major = element_blank(), | |
panel.grid.minor = element_blank() | |
) + | |
guides(color = guide_legend(nrow = 1, byrow = TRUE)), | |
height = 8, | |
width = 8 | |
) | |
} | |
get_filtered_lcpm <- function(samples, group) { | |
if (length(unique(group)) < 2) { | |
dge <- DGEList(counts = samples) | |
keep <- filterByExpr(dge) | |
dge <- dge[keep, , keep.lib.sizes = FALSE] # First filtering step | |
dge <- calcNormFactors(dge) | |
v <- voom(dge, plot = T) | |
lcpm <- v$E | |
return(lcpm) | |
} | |
# 2-step filtering | |
dge <- DGEList(counts = samples, group = group) | |
design <- model.matrix( ~ group) | |
keep <- filterByExpr(dge, design = design) | |
dge <- dge[keep, , keep.lib.sizes = FALSE] # First filtering step | |
dge <- calcNormFactors(dge) | |
v <- voom(dge, plot = T) | |
lcpm <- v$E | |
group_labels <- unique(group) | |
print(group_labels) | |
n_1 <- length(group[group == group_labels[1]]) | |
n_2 <- length(group[group == group_labels[2]]) | |
size_of_smallest_group <- ifelse(n_1 < n_2, n_1, n_2) | |
keep <- rownames(lcpm)[!(rowSums(lcpm < 1) > size_of_smallest_group)] # Second filtering step | |
lcpm <- lcpm[keep, ] | |
print(nrow(lcpm)) | |
return(lcpm) | |
} | |
get_dvol <- function(dges) { | |
dvol <- dges %>% | |
mutate(direction = ifelse(!(adj.P.Val < 0.05), "nonsig", ifelse( | |
logFC >= 1, "up", ifelse(logFC <= -1, "dn", "lowdiff") | |
))) | |
print("Table summary of DGEs direction:") | |
print(table(dvol$direction)) | |
return(dvol) | |
} | |
perform_DGE_limma <- function(lcpm, group, annotation) { | |
design <- model.matrix(~group) | |
fit <- lmFit(lcpm, design = design) | |
toptab <- topTable( | |
fit = eBayes(fit), | |
coef = ncol(design), | |
number = Inf, | |
) | |
toptab <- toptab %>% | |
mutate(ens_gene_id = rownames(toptab)) %>% | |
dplyr::select(-t, -B, -AveExpr, P.Val = P.Value) #%>% # we do not need the information from these columns | |
toptab <- annotate(toptab, annotation) | |
print(sprintf( | |
"Found %d statistically significant DGEs using limma::lmFit", | |
nrow(toptab) | |
)) | |
sig_info(toptab) | |
return(toptab) | |
} | |
perform_DGE_wilcoxon <- function(lcpm, group, annotation) { | |
gexp_by_gene_id_and_sample <- as.data.frame(t(lcpm)) %>% | |
mutate(sample = row.names(.)) %>% | |
gather(ens_gene_id, gene_exp, -sample) # Exclude sample from gathering | |
metadata <- data.frame(sample = colnames(lcpm), from_group = group) | |
# Annotate using metadata | |
gexp_with_group_info <- inner_join(gexp_by_gene_id_and_sample, metadata, by = join_by(sample)) | |
summ_stats <- gexp_with_group_info %>% | |
group_by(ens_gene_id, from_group) %>% | |
summarise(mean_gene_exp = mean(gene_exp), .groups = 'drop') %>% | |
as.data.frame() %>% | |
pivot_wider(names_from = from_group, values_from = mean_gene_exp) | |
print( | |
sprintf( | |
"Treating group label %s as disease/reference group in logFC calculation", | |
colnames(summ_stats)[3] | |
) | |
) | |
print( | |
sprintf( | |
"Calculating logFC as mean_expr(%s) - mean_expr(%s) for each gene...", | |
colnames(summ_stats)[3], | |
colnames(summ_stats)[2] | |
) | |
) | |
# if (half) { | |
# print("Halving logFC") | |
# summ_stats <- summ_stats %>% mutate(logFC = (.[[3]] - .[[2]]) / 2) | |
# } else { | |
# summ_stats <- summ_stats %>% mutate(logFC = .[[3]] - .[[2]]) | |
# } | |
summ_stats <- summ_stats %>% mutate(logFC = .[[3]] - .[[2]]) | |
print("Calculated logFC") | |
wilcox <- gexp_with_group_info %>% | |
group_by(ens_gene_id) %>% | |
do(tidy(wilcox.test(gene_exp ~ from_group, data = .))) %>% | |
as.data.frame() %>% | |
rename(P.Val = p.value) %>% | |
mutate(adj.P.Val = p.adjust(P.Val, method = "BH")) %>% # benjamini-hochberg MH correction | |
inner_join(summ_stats, by = join_by(ens_gene_id)) | |
dges <- wilcox #%>% | |
#dplyr::select(ens_gene_id, P.Val, adj.P.Val, logFC) | |
if (!(is.null(annotation))) { | |
dges <- annotate(dges, annotation) | |
} | |
print(sprintf("Found %d statistically significant DGEs using Wilcoxon", nrow(dges))) | |
sig_info(dges) | |
return(dges) | |
} | |
plot_dvol <- function(dges, path, title) { | |
dvol <- get_dvol(dges) | |
print(colnames(dvol)) | |
pcut <- filter(dges, adj.P.Val < 0.05) %>% with(P.Val) %>% max() | |
labup <- filter(dvol, direction == 'up') #%>% slice(1:10) | |
labdn <- filter(dvol, direction == 'dn') #%>% slice(1:10) | |
# unique(direction) should give up dn lowdiff nonsig (not necessarily in this order) | |
# up: top right box | |
# dn: top left box | |
# lowdiff (above pcut but low log FC) top middle box | |
# nonsig (below pcut, any logFC) all three bottom boxes | |
dvol_plot <- ggplot(dvol, aes(x = logFC, y = -log10(P.Val))) + | |
geom_point(aes(color = direction, fill = direction)) + | |
# !!! color maps are based on unique values in X column of df where X is mentioned in color = X | |
ggrepel::geom_text_repel( | |
aes(label = gene_name), | |
data = labup, | |
fontface = 'italic', | |
max.overlaps = 16 # 12 | |
) + | |
ggtitle(title) + | |
ggrepel::geom_text_repel(aes(label = gene_name), data = labdn, fontface = 'italic') + | |
scale_color_manual(values = c( | |
"up" = 'red', | |
"dn" = 'blue', | |
"lowdiff" = 'grey', | |
"nonsig" = 'grey' | |
)) + | |
geom_vline(xintercept = 1, linetype = 'dotted') + # diff log cpm > 1 = upregulated | |
geom_vline(xintercept = -1, linetype = 'dotted') + # diff log cpm < 1 = downregulated | |
geom_hline(yintercept = -log10(pcut), linetype = 'dotted') + | |
ylab(as.expression(bquote(paste(-log[10], 'P')))) + | |
theme_classic(base_size = 15) + | |
theme(legend.position = 'none') + | |
theme(plot.title = element_text(hjust = 0.5)) + | |
theme( | |
axis.title = element_text(face = "bold", size = "14"), | |
axis.text = element_text(face = "bold", size = "14"), | |
legend.text = element_text(face = "bold", size = "11"), | |
legend.title = element_blank(), | |
panel.grid.major = element_blank(), | |
panel.grid.minor = element_blank(), | |
plot.title = element_text(face = "bold", size = 10) | |
) | |
if (!(path == "")) { | |
print(sprintf("Saving plot to path: %s", path)) | |
ggplot2::ggsave(path, dvol_plot, width = 8, height = 8) | |
} | |
} | |
get_ora_ids <- function(dges, path) { | |
# Assuming dges are significant (adj.P.Val < 0.05 and abs(logFC) > 1) | |
ora <- data.frame(ens_gene_id = dges$ens_gene_id) | |
if (!(path == "")) { | |
write.table( | |
ora$ens_gene_id, | |
path, | |
sep = "\t", | |
row.names = FALSE, | |
col.names = FALSE, | |
quote = FALSE | |
) | |
} | |
} | |
get_gsea_rnk <- function(dges, path) { | |
# Assuming dges are significant (adj.P.Val < 0.05 and abs(logFC) > 1) | |
gsea <- data.frame( | |
ens_gene_id = dges$ens_gene_id, | |
rankings = sign(dges$logFC) * (-log10(dges$P.Val)) | |
) | |
write.table( | |
gsea, | |
path, | |
sep = "\t", | |
row.names = FALSE, | |
col.names = FALSE, | |
quote = FALSE | |
) | |
} | |
get_sig_dges <- function(dges) { | |
sig <- dges %>% filter(abs(logFC) >= 1) %>% filter(adj.P.Val < 0.05) | |
return(sig) | |
} | |
get_pig_genes <- function() { | |
mart <- useMart("ENSEMBL_MART_ENSEMBL", host = "https://may2024.archive.ensembl.org/") | |
pigMart <- useMart("ENSEMBL_MART_ENSEMBL", | |
dataset = "sscrofa_gene_ensembl", | |
host = | |
"https://may2024.archive.ensembl.org/") | |
atts <- listAttributes(pigMart) # list all attributes | |
piggenes <- getBM( | |
c( | |
'ensembl_gene_id', | |
'external_gene_name', | |
'hsapiens_homolog_ensembl_gene', | |
'hsapiens_homolog_associated_gene_name' | |
), | |
mart = pigMart | |
) %>% dplyr::select( | |
ens_gene_id = ensembl_gene_id, | |
human_ensembl_gene_id = hsapiens_homolog_ensembl_gene, | |
pig_gene_name = external_gene_name, | |
human_gene_name = hsapiens_homolog_associated_gene_name | |
) %>% | |
filter(!(human_ensembl_gene_id == "")) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment