Skip to content

Instantly share code, notes, and snippets.

@hbhutta
Last active December 23, 2024 18:22
Show Gist options
  • Save hbhutta/acc9606e526bc100c721af0646d9940a to your computer and use it in GitHub Desktop.
Save hbhutta/acc9606e526bc100c721af0646d9940a to your computer and use it in GitHub Desktop.
A suite of R functions for repetitive bulk RNAseq procedures which I wrote while working as a data analyst at a hospital.
library(limma)
library(edgeR)
library(sva)
library(ggplot2)
library(tidyr)
library(broom)
library(dplyr)
library(ggpubr)
library(biomaRt)
options(stringsAsFactors = F)
sig_info <- function(dges) {
print(sprintf(
"#. of DGEs that have adjusted p-value < 0.05: %s",
sum(dges$adj.P.Val < 0.05)
))
print(sprintf(
"#. of DGEs that have adjusted p-value < 0.05 AND abs(logFC) >= 1: %s",
sum(dges$adj.P.Val < 0.05 & abs(dges$logFC) >= 1)
))
}
annotate <- function(dges, annotation) {
annotation <- rename(annotation, ens_gene_id = gene_id)
dges <- dges %>% inner_join(annotation, by = join_by(ens_gene_id))
return(dges)
}
plot_PCA <- function(pca, lcpm, title, path, group_or_batch) {
ggsave(
path,
ggbiplot::ggbiplot(
pca,
obs.scale = 1,
var.scale = 1,
labels = colnames(lcpm),
ellipse = TRUE,
ellipse.linewidth = 1,
ellipse.fill = TRUE,
ellipse.alpha = 0.2,
varname.size = TRUE,
pc.biplot = FALSE,
var.axes = FALSE,
circle = FALSE,
group = group_or_batch
) +
theme_bw() +
theme(legend.position = 'bottom') +
theme(plot.title = element_text(hjust = 0.5)) + # Center title
scale_color_manual(values = c("royalblue2", "red3")) + # ellipse border colors
scale_fill_manual(values = c("cyan", "red1")) + # ellipse fill color
ggtitle(title) +
labs(
x = paste0("PC1 (", round(summary(pca)$importance[2, 1] * 100, 2), "%)"),
y = paste0("PC2 (", round(summary(pca)$importance[2, 2] * 100, 2), "%)"),
color = "Treatment",
) +
theme(
axis.title = element_text(face = "bold", size = "14"),
axis.text = element_text(face = "bold", size = "14"),
legend.text = element_text(face = "bold", size = "11"),
legend.title = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
) +
guides(color = guide_legend(nrow = 1, byrow = TRUE)),
height = 8,
width = 8
)
}
get_filtered_lcpm <- function(samples, group) {
if (length(unique(group)) < 2) {
dge <- DGEList(counts = samples)
keep <- filterByExpr(dge)
dge <- dge[keep, , keep.lib.sizes = FALSE] # First filtering step
dge <- calcNormFactors(dge)
v <- voom(dge, plot = T)
lcpm <- v$E
return(lcpm)
}
# 2-step filtering
dge <- DGEList(counts = samples, group = group)
design <- model.matrix( ~ group)
keep <- filterByExpr(dge, design = design)
dge <- dge[keep, , keep.lib.sizes = FALSE] # First filtering step
dge <- calcNormFactors(dge)
v <- voom(dge, plot = T)
lcpm <- v$E
group_labels <- unique(group)
print(group_labels)
n_1 <- length(group[group == group_labels[1]])
n_2 <- length(group[group == group_labels[2]])
size_of_smallest_group <- ifelse(n_1 < n_2, n_1, n_2)
keep <- rownames(lcpm)[!(rowSums(lcpm < 1) > size_of_smallest_group)] # Second filtering step
lcpm <- lcpm[keep, ]
print(nrow(lcpm))
return(lcpm)
}
get_dvol <- function(dges) {
dvol <- dges %>%
mutate(direction = ifelse(!(adj.P.Val < 0.05), "nonsig", ifelse(
logFC >= 1, "up", ifelse(logFC <= -1, "dn", "lowdiff")
)))
print("Table summary of DGEs direction:")
print(table(dvol$direction))
return(dvol)
}
perform_DGE_limma <- function(lcpm, group, annotation) {
design <- model.matrix(~group)
fit <- lmFit(lcpm, design = design)
toptab <- topTable(
fit = eBayes(fit),
coef = ncol(design),
number = Inf,
)
toptab <- toptab %>%
mutate(ens_gene_id = rownames(toptab)) %>%
dplyr::select(-t, -B, -AveExpr, P.Val = P.Value) #%>% # we do not need the information from these columns
toptab <- annotate(toptab, annotation)
print(sprintf(
"Found %d statistically significant DGEs using limma::lmFit",
nrow(toptab)
))
sig_info(toptab)
return(toptab)
}
perform_DGE_wilcoxon <- function(lcpm, group, annotation) {
gexp_by_gene_id_and_sample <- as.data.frame(t(lcpm)) %>%
mutate(sample = row.names(.)) %>%
gather(ens_gene_id, gene_exp, -sample) # Exclude sample from gathering
metadata <- data.frame(sample = colnames(lcpm), from_group = group)
# Annotate using metadata
gexp_with_group_info <- inner_join(gexp_by_gene_id_and_sample, metadata, by = join_by(sample))
summ_stats <- gexp_with_group_info %>%
group_by(ens_gene_id, from_group) %>%
summarise(mean_gene_exp = mean(gene_exp), .groups = 'drop') %>%
as.data.frame() %>%
pivot_wider(names_from = from_group, values_from = mean_gene_exp)
print(
sprintf(
"Treating group label %s as disease/reference group in logFC calculation",
colnames(summ_stats)[3]
)
)
print(
sprintf(
"Calculating logFC as mean_expr(%s) - mean_expr(%s) for each gene...",
colnames(summ_stats)[3],
colnames(summ_stats)[2]
)
)
# if (half) {
# print("Halving logFC")
# summ_stats <- summ_stats %>% mutate(logFC = (.[[3]] - .[[2]]) / 2)
# } else {
# summ_stats <- summ_stats %>% mutate(logFC = .[[3]] - .[[2]])
# }
summ_stats <- summ_stats %>% mutate(logFC = .[[3]] - .[[2]])
print("Calculated logFC")
wilcox <- gexp_with_group_info %>%
group_by(ens_gene_id) %>%
do(tidy(wilcox.test(gene_exp ~ from_group, data = .))) %>%
as.data.frame() %>%
rename(P.Val = p.value) %>%
mutate(adj.P.Val = p.adjust(P.Val, method = "BH")) %>% # benjamini-hochberg MH correction
inner_join(summ_stats, by = join_by(ens_gene_id))
dges <- wilcox #%>%
#dplyr::select(ens_gene_id, P.Val, adj.P.Val, logFC)
if (!(is.null(annotation))) {
dges <- annotate(dges, annotation)
}
print(sprintf("Found %d statistically significant DGEs using Wilcoxon", nrow(dges)))
sig_info(dges)
return(dges)
}
plot_dvol <- function(dges, path, title) {
dvol <- get_dvol(dges)
print(colnames(dvol))
pcut <- filter(dges, adj.P.Val < 0.05) %>% with(P.Val) %>% max()
labup <- filter(dvol, direction == 'up') #%>% slice(1:10)
labdn <- filter(dvol, direction == 'dn') #%>% slice(1:10)
# unique(direction) should give up dn lowdiff nonsig (not necessarily in this order)
# up: top right box
# dn: top left box
# lowdiff (above pcut but low log FC) top middle box
# nonsig (below pcut, any logFC) all three bottom boxes
dvol_plot <- ggplot(dvol, aes(x = logFC, y = -log10(P.Val))) +
geom_point(aes(color = direction, fill = direction)) +
# !!! color maps are based on unique values in X column of df where X is mentioned in color = X
ggrepel::geom_text_repel(
aes(label = gene_name),
data = labup,
fontface = 'italic',
max.overlaps = 16 # 12
) +
ggtitle(title) +
ggrepel::geom_text_repel(aes(label = gene_name), data = labdn, fontface = 'italic') +
scale_color_manual(values = c(
"up" = 'red',
"dn" = 'blue',
"lowdiff" = 'grey',
"nonsig" = 'grey'
)) +
geom_vline(xintercept = 1, linetype = 'dotted') + # diff log cpm > 1 = upregulated
geom_vline(xintercept = -1, linetype = 'dotted') + # diff log cpm < 1 = downregulated
geom_hline(yintercept = -log10(pcut), linetype = 'dotted') +
ylab(as.expression(bquote(paste(-log[10], 'P')))) +
theme_classic(base_size = 15) +
theme(legend.position = 'none') +
theme(plot.title = element_text(hjust = 0.5)) +
theme(
axis.title = element_text(face = "bold", size = "14"),
axis.text = element_text(face = "bold", size = "14"),
legend.text = element_text(face = "bold", size = "11"),
legend.title = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.title = element_text(face = "bold", size = 10)
)
if (!(path == "")) {
print(sprintf("Saving plot to path: %s", path))
ggplot2::ggsave(path, dvol_plot, width = 8, height = 8)
}
}
get_ora_ids <- function(dges, path) {
# Assuming dges are significant (adj.P.Val < 0.05 and abs(logFC) > 1)
ora <- data.frame(ens_gene_id = dges$ens_gene_id)
if (!(path == "")) {
write.table(
ora$ens_gene_id,
path,
sep = "\t",
row.names = FALSE,
col.names = FALSE,
quote = FALSE
)
}
}
get_gsea_rnk <- function(dges, path) {
# Assuming dges are significant (adj.P.Val < 0.05 and abs(logFC) > 1)
gsea <- data.frame(
ens_gene_id = dges$ens_gene_id,
rankings = sign(dges$logFC) * (-log10(dges$P.Val))
)
write.table(
gsea,
path,
sep = "\t",
row.names = FALSE,
col.names = FALSE,
quote = FALSE
)
}
get_sig_dges <- function(dges) {
sig <- dges %>% filter(abs(logFC) >= 1) %>% filter(adj.P.Val < 0.05)
return(sig)
}
get_pig_genes <- function() {
mart <- useMart("ENSEMBL_MART_ENSEMBL", host = "https://may2024.archive.ensembl.org/")
pigMart <- useMart("ENSEMBL_MART_ENSEMBL",
dataset = "sscrofa_gene_ensembl",
host =
"https://may2024.archive.ensembl.org/")
atts <- listAttributes(pigMart) # list all attributes
piggenes <- getBM(
c(
'ensembl_gene_id',
'external_gene_name',
'hsapiens_homolog_ensembl_gene',
'hsapiens_homolog_associated_gene_name'
),
mart = pigMart
) %>% dplyr::select(
ens_gene_id = ensembl_gene_id,
human_ensembl_gene_id = hsapiens_homolog_ensembl_gene,
pig_gene_name = external_gene_name,
human_gene_name = hsapiens_homolog_associated_gene_name
) %>%
filter(!(human_ensembl_gene_id == ""))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment