Created
October 20, 2016 11:23
-
-
Save mlist/8a514d6e312e7d7efde4ebd0390e8133 to your computer and use it in GitHub Desktop.
Download all BLUEPRINT gene expression data and format as numeric matrix
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download all BLUEPRINT gene expression data from DeepBlue and format it as a
# numeric matrix (genes in rows, samples in columns).

# Load dependencies
# Install DeepBlueR from Bioconductor:
# http://bioconductor.org/packages/release/bioc/html/DeepBlueR.html
library(DeepBlueR)
library(dplyr)
library(tidyr)

# List all BLUEPRINT samples
blueprint_samples <- deepblue_list_samples(
  extra_metadata = list("source" = "BLUEPRINT Epigenome")
)

# Extract their ids
blueprint_samples_ids <- deepblue_extract_ids(blueprint_samples)

# Select gene expression data. We assign gene names using Gencode 22
gene_exprs_query <- deepblue_select_gene_expressions(
  sample_ids = blueprint_samples_ids,
  gene_model = "gencode v22"
)

# We request the data and define the output format (one row per
# gene / sample combination with the four columns listed below)
request <- deepblue_get_regions(
  query_id = gene_exprs_query,
  output_format = "@GENE_ID(gencode v22),FPKM,@BIOSOURCE,@SAMPLE_ID"
)

# We download the data
# NOTE(review): the code below assumes a data-frame-like result whose FPKM
# column is numeric; data.matrix() would turn a character column into factor
# codes -- confirm against the actual download.
gene_regions <- deepblue_download_request_data(request)

# We retain a table mapping sample ids to biosources
sample_names <- dplyr::select(gene_regions, `@BIOSOURCE`, `@SAMPLE_ID`) %>%
  dplyr::distinct()

# We filter out duplicated gene entries. Duplicates are detected within a
# single reference sample; a gene id occurring more than once there would
# prevent spreading the long table into one column per sample.
reference_sample <- "s10678"
genes_one_sample <- dplyr::filter(
  gene_regions,
  `@SAMPLE_ID` == reference_sample
)
if (nrow(genes_one_sample) == 0) {
  warning(
    "Reference sample '", reference_sample, "' was not found in the ",
    "downloaded data; no duplicated gene ids will be filtered out.",
    call. = FALSE
  )
}
# `[[` always yields a plain vector, whether the download is a data.frame,
# a tibble or a data.table (single-bracket column indexing does not).
reference_gene_ids <- genes_one_sample[["@GENE_ID(gencode v22)"]]
duplicated_genes <- reference_gene_ids[duplicated(reference_gene_ids)]

# We convert the gene expression from a long table to a wide data frame
# (one row per gene, one column per sample) and subsequently...
genes_matrix <- dplyr::filter(
  gene_regions,
  !(`@GENE_ID(gencode v22)` %in% duplicated_genes)
) %>%
  dplyr::select(-`@BIOSOURCE`) %>%
  tidyr::spread(key = `@SAMPLE_ID`, value = FPKM)

# ...to a numeric matrix
# Normalise to a base data.frame first so that the indexing below behaves the
# same for every table class; the first column holds the gene ids.
genes_matrix <- as.data.frame(genes_matrix)
genes <- genes_matrix[[1]]
# drop = FALSE keeps the sample column name even if only one sample remains
genes_matrix <- data.matrix(genes_matrix[, -1, drop = FALSE])
rownames(genes_matrix) <- genes

### OUTPUT
### genes_matrix : The gene expression matrix for all 276 BLUEPRINT samples
### sample_names : A mapping table from sample id to cell type / biosource
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment