Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save skchronicles/133bb7fe795652e41efe8ef4dfc410b2 to your computer and use it in GitHub Desktop.
Save skchronicles/133bb7fe795652e41efe8ef4dfc410b2 to your computer and use it in GitHub Desktop.
Group-based CPM filtering for RNA-seq
library("edgeR")
# Reading in raw counts matrix
raw <- read.table(
file = params$raw,
sep = "\t",
header = TRUE,
row.names = 1,
quote = ""
)
# Reading in sample sheet,
# contains this TSV file
# contains two columns:
# Sample and Group
groupinfo <- read.table(
file = params$group,
sep = "\t",
header = TRUE,
stringsAsFactors = TRUE
)
rownames(groupinfo) = make.names(groupinfo$Sample)
# Reorder the columns of the counts
# matrix to match the order of the
# Sample column of groupinfo TSV,
# Samples with names that do not
# match are dropped
grp_idx <- match(make.names(colnames(raw)), row.names(groupinfo), nomatch = 0)
groupinfo <- groupinfo[grp_idx,]
# Create DGEList object with edgeR
group <- as.factor(groupinfo$Group)
deg <- edgeR::DGEList(counts = raw, group = group)
# Calculate CPM to apply group-based
# CPM filtering. This filter is used
# to remove lowly expressed genes where
# a gene must have a CPM of 0.5 in at
# least 50% of the samples in at least
# half of the groups.
cpm_threshold <- 0.5
n_genes_before_filtering <- nrow(raw)
cpm_matrix <- edgeR::cpm(raw)
group_levels <- levels(group)
n_groups <- length(group_levels)
samples_per_group <- table(group)
half_samples_per_group <- ceiling(samples_per_group / 2)
half_groups <- ceiling(n_groups / 2)
# Gene must have CPM>=0.5 in 50%
# of the samples per a given group
expr_by_group <- sapply(group_levels, function(g) {
samples_in_group <- which(group == g)
rowSums(cpm_matrix[, samples_in_group] >= cpm_threshold) >= half_samples_per_group[g]
})
# Gene must have a CPM>=0.5 in 50%
# of the groups
keep_genes <- rowSums(expr_by_group) >= half_groups
# Filter lowly expressed genes
# Gene must have CPM>=0.5 in 50%
# samples-per-group in 50% of the
# Groups. The actual filtering
# uses CPM values rather than counts
# in order to avoid biases to
# samples with large library sizes.
keep_genes <- rowSums(expr_by_group) >= half_groups
# Recaluate new lib.sizes after filtering
deg <- deg[keep_genes, , keep.lib.sizes = FALSE]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment