|
rm(list=ls(all=TRUE)) |
|
|
|
### ============ 0. Installation and setup ========= |
|
### ============ Install packages from Bioconductor ======== |
|
# source("https://bioconductor.org/biocLite.R") |
|
# biocLite(c("edgeR")) |
|
library("edgeR") |
|
|
|
### ============ Declare directories ========= |
|
BASE_DIR = "/Documents/data/TCGA" # Change to suit your directory |
|
TCGAOV_data_FILE = file.path(BASE_DIR, "TCGAOV_data.rda") |
|
TCGAOV_EM_expression_FILE = file.path(BASE_DIR, |
|
"MesenchymalvsImmunoreactive_RNAseq_expression.txt") |
|
TCGAOV_EM_phenotype_FILE = file.path(BASE_DIR, |
|
"MesenchymalvsImmunoreactive_RNAseq_phenotype.cls") |
|
|
|
### ============ Declare samples ========= |
|
CATEGORY_BASELINE = "Immunoreactive" |
|
CATEGORY_TEST = "Mesenchymal" |
|
|
|
### ============ Load DGEList ========= |
|
load(TCGAOV_data_FILE) |
|
|
|
### ============ Filter and Normalize =============== |
|
index_BASELINE = TCGAOV_data$samples$group == CATEGORY_BASELINE |
|
index_TEST = TCGAOV_data$samples$group == CATEGORY_TEST |
|
index_BASELINE_TEST = which(index_BASELINE | index_TEST) |
|
N_BASELINE = sum(index_BASELINE) |
|
N_TEST = sum(index_TEST) |
|
|
|
### Subset DGEList for (B)ASELINE and (T)EST samples |
|
TCGAOV_data_BT = TCGAOV_data[,index_BASELINE_TEST] |
|
|
|
### Filter based on gene counts |
|
row_with_mincount = rowSums(cpm(TCGAOV_data_BT) > 10) >= min(N_BASELINE, |
|
N_TEST) |
|
|
|
TCGAOV_filtered = TCGAOV_data_BT[row_with_mincount, , keep.lib.sizes=FALSE] |
|
TCGAOV_normalized_TMM = calcNormFactors(TCGAOV_filtered, method="TMM") |
|
|
|
### ============ Format expression =============== |
|
### Work with normalized counts per million (CPM) |
|
TCGAOV_counts_cpm <- cpm(TCGAOV_normalized_TMM, normalized.lib.size=TRUE) |
|
|
|
### Merge the DGEList variables 'genes' and 'counts' (as data.frame) |
|
TCGAOV_merged = merge(TCGAOV_normalized_TMM$genes, |
|
data.frame(TCGAOV_counts_cpm, check.names = FALSE), |
|
by="row.names", |
|
all=FALSE) |
|
|
|
### Data frame: 'external_gene_name', 'ensembl_gene_id' and sample IDs |
|
TCGAOV_EM_expression = cbind(TCGAOV_merged[,c('external_gene_name', |
|
'ensembl_gene_id')], |
|
TCGAOV_merged[,-(1:8)]) |
|
|
|
### Rename columns |
|
colnames(TCGAOV_EM_expression)[1] <- "NAME" |
|
colnames(TCGAOV_EM_expression)[2] <- "DESCRIPTION" |
|
|
|
### Write |
|
write.table(TCGAOV_EM_expression, |
|
TCGAOV_EM_expression_FILE, |
|
col.name=TRUE, |
|
sep="\t", |
|
row.names=FALSE, |
|
quote=FALSE) |
|
|
|
### ============ Format phenotypes =============== |
|
### Get the 'Mesenchymal' and 'Immunoreactive' |
|
N_samples = dim(TCGAOV_normalized_TMM)[2] |
|
N_classes = 2 |
|
|
|
### No. samples, No. classes, 1 |
|
cat(paste(N_samples, N_classes, "1", sep = " "), |
|
file = TCGAOV_EM_phenotype_FILE, |
|
sep = "\n", |
|
append = FALSE) |
|
|
|
### Class label names |
|
cat(paste("#", CATEGORY_BASELINE, CATEGORY_TEST, sep = " "), |
|
file = TCGAOV_EM_phenotype_FILE, |
|
sep = "\n", |
|
append = TRUE) |
|
|
|
### Sample class labels |
|
cat(paste(TCGAOV_normalized_TMM$samples$group, collapse = "\t"), |
|
file = TCGAOV_EM_phenotype_FILE, |
|
sep = "\n", |
|
append = TRUE) |