Created
July 25, 2025 15:46
-
-
Save skchronicles/133bb7fe795652e41efe8ef4dfc410b2 to your computer and use it in GitHub Desktop.
Group-based CPM filtering for RNA-seq
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library("edgeR") | |
# Reading in raw counts matrix | |
raw <- read.table( | |
file = params$raw, | |
sep = "\t", | |
header = TRUE, | |
row.names = 1, | |
quote = "" | |
) | |
# Reading in sample sheet, | |
# contains this TSV file | |
# contains two columns: | |
# Sample and Group | |
groupinfo <- read.table( | |
file = params$group, | |
sep = "\t", | |
header = TRUE, | |
stringsAsFactors = TRUE | |
) | |
rownames(groupinfo) = make.names(groupinfo$Sample) | |
# Reorder the columns of the counts | |
# matrix to match the order of the | |
# Sample column of groupinfo TSV, | |
# Samples with names that do not | |
# match are dropped | |
grp_idx <- match(make.names(colnames(raw)), row.names(groupinfo), nomatch = 0) | |
groupinfo <- groupinfo[grp_idx,] | |
# Create DGEList object with edgeR | |
group <- as.factor(groupinfo$Group) | |
deg <- edgeR::DGEList(counts = raw, group = group) | |
# Calculate CPM to apply group-based | |
# CPM filtering. This filter is used | |
# to remove lowly expressed genes where | |
# a gene must have a CPM of 0.5 in at | |
# least 50% of the samples in at least | |
# half of the groups. | |
cpm_threshold <- 0.5 | |
n_genes_before_filtering <- nrow(raw) | |
cpm_matrix <- edgeR::cpm(raw) | |
group_levels <- levels(group) | |
n_groups <- length(group_levels) | |
samples_per_group <- table(group) | |
half_samples_per_group <- ceiling(samples_per_group / 2) | |
half_groups <- ceiling(n_groups / 2) | |
# Gene must have CPM>=0.5 in 50% | |
# of the samples per a given group | |
expr_by_group <- sapply(group_levels, function(g) { | |
samples_in_group <- which(group == g) | |
rowSums(cpm_matrix[, samples_in_group] >= cpm_threshold) >= half_samples_per_group[g] | |
}) | |
# Gene must have a CPM>=0.5 in 50% | |
# of the groups | |
keep_genes <- rowSums(expr_by_group) >= half_groups | |
# Filter lowly expressed genes | |
# Gene must have CPM>=0.5 in 50% | |
# samples-per-group in 50% of the | |
# Groups. The actual filtering | |
# uses CPM values rather than counts | |
# in order to avoid biases to | |
# samples with large library sizes. | |
keep_genes <- rowSums(expr_by_group) >= half_groups | |
# Recaluate new lib.sizes after filtering | |
deg <- deg[keep_genes, , keep.lib.sizes = FALSE] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment