skchronicles · July 25, 2025 15:46
diff --git a/rna-seq_group-based_cpm_filtering.R b/rna-seq_group-based_cpm_filtering.R
 library("edgeR")

 # Reading in raw counts matrix
 raw <- read.table(
    file = params$raw,
    sep = "\t",
    header = TRUE,
    row.names = 1,
    quote = ""
 )

 # Reading in sample sheet,
 # contains this TSV file
 # contains two columns:
 # Sample and Group
 groupinfo <- read.table(
    file = params$group,
    sep = "\t",
    header = TRUE,
    stringsAsFactors = TRUE
 )
 rownames(groupinfo) = make.names(groupinfo$Sample)

 # Reorder the columns of the counts
 # matrix to match the order of the
 # Sample column of groupinfo TSV,
 # Samples with names that do not
 # match are dropped
 grp_idx <- match(make.names(colnames(raw)), row.names(groupinfo), nomatch = 0)
 groupinfo <- groupinfo[grp_idx,]

 # Create DGEList object with edgeR
 group <- as.factor(groupinfo$Group)
 deg <- edgeR::DGEList(counts = raw, group = group)

 # Calculate CPM to apply group-based
 # CPM filtering. This filter is used
 # to remove lowly expressed genes where
 # a gene must have a CPM of 0.5 in at
 # least 50% of the samples in at least
 # half of the groups.
 cpm_threshold <- 0.5
 n_genes_before_filtering <- nrow(raw)
 cpm_matrix <- edgeR::cpm(raw)
 group_levels <- levels(group)
 n_groups <- length(group_levels)
 samples_per_group <- table(group)
 half_samples_per_group <- ceiling(samples_per_group / 2)
 half_groups <- ceiling(n_groups / 2)
 # Gene must have CPM>=0.5 in 50%
 # of the samples per a given group
 expr_by_group <- sapply(group_levels, function(g) {
  samples_in_group <- which(group == g)
  rowSums(cpm_matrix[, samples_in_group] >= cpm_threshold) >= half_samples_per_group[g]
 })
 # Gene must have a CPM>=0.5 in 50%
 # of the groups
 keep_genes <- rowSums(expr_by_group) >= half_groups
 # Filter lowly expressed genes
 # Gene must have CPM>=0.5 in 50% 
 # samples-per-group in 50% of the
 # Groups. The actual filtering
 # uses CPM values rather than counts
 # in order to avoid biases to
 # samples with large library sizes.
 keep_genes <- rowSums(expr_by_group) >= half_groups
 # Recaluate new lib.sizes after filtering
 deg <- deg[keep_genes, , keep.lib.sizes = FALSE]
	library("edgeR")

	# Reading in raw counts matrix
	raw <- read.table(
	file = params$raw,
	sep = "\t",
	header = TRUE,
	row.names = 1,
	quote = ""
	)

	# Reading in sample sheet,
	# contains this TSV file
	# contains two columns:
	# Sample and Group
	groupinfo <- read.table(
	file = params$group,
	sep = "\t",
	header = TRUE,
	stringsAsFactors = TRUE
	)
	rownames(groupinfo) = make.names(groupinfo$Sample)

	# Reorder the columns of the counts
	# matrix to match the order of the
	# Sample column of groupinfo TSV,
	# Samples with names that do not
	# match are dropped
	grp_idx <- match(make.names(colnames(raw)), row.names(groupinfo), nomatch = 0)
	groupinfo <- groupinfo[grp_idx,]

	# Create DGEList object with edgeR
	group <- as.factor(groupinfo$Group)
	deg <- edgeR::DGEList(counts = raw, group = group)

	# Calculate CPM to apply group-based
	# CPM filtering. This filter is used
	# to remove lowly expressed genes where
	# a gene must have a CPM of 0.5 in at
	# least 50% of the samples in at least
	# half of the groups.
	cpm_threshold <- 0.5
	n_genes_before_filtering <- nrow(raw)
	cpm_matrix <- edgeR::cpm(raw)
	group_levels <- levels(group)
	n_groups <- length(group_levels)
	samples_per_group <- table(group)
	half_samples_per_group <- ceiling(samples_per_group / 2)
	half_groups <- ceiling(n_groups / 2)
	# Gene must have CPM>=0.5 in 50%
	# of the samples per a given group
	expr_by_group <- sapply(group_levels, function(g) {
	samples_in_group <- which(group == g)
	rowSums(cpm_matrix[, samples_in_group] >= cpm_threshold) >= half_samples_per_group[g]
	})
	# Gene must have a CPM>=0.5 in 50%
	# of the groups
	keep_genes <- rowSums(expr_by_group) >= half_groups
	# Filter lowly expressed genes
	# Gene must have CPM>=0.5 in 50%
	# samples-per-group in 50% of the
	# Groups. The actual filtering
	# uses CPM values rather than counts
	# in order to avoid biases to
	# samples with large library sizes.
	keep_genes <- rowSums(expr_by_group) >= half_groups
	# Recaluate new lib.sizes after filtering
	deg <- deg[keep_genes, , keep.lib.sizes = FALSE]