Last active
August 10, 2020 12:18
-
-
Save oganm/4dc620d75a893608b4862e11a31c0050 to your computer and use it in GitHub Desktop.
yeast download for jesse
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(gemmaAPI)
# NOTE(review): `%>%` is used throughout this script but only gemmaAPI is
# attached here -- presumably gemmaAPI makes the pipe available; confirm, or
# attach the package providing it.

# identify all yeast datasets
# NOTE(review): limit = 0 presumably means "no limit" -- confirm against the
# gemmaAPI documentation.
yeastStudies = taxonInfo('yeast', request = 'datasets', limit = 0)
studyIDs = yeastStudies %>% purrr::map_chr('id')

# get metadata for yeast studies (one compileMetadata() call per study)
yeastMetadata = studyIDs %>% lapply(compileMetadata, outputType = 'list')

# Quality score of every dataset. `.default` keeps a dataset without a
# 'geeq.qualityScore' entry from aborting the whole script: it contributes NA
# instead, which hist() leaves out of the plot.
quality = yeastMetadata %>%
    purrr::map('experimentData') %>%
    purrr::map_dbl('geeq.qualityScore', .default = NA_real_)
quality %>% hist()
# count datasets for platforms. note that there are only 3 experiments between
# the most common (GPL2529, 58 experiments, 4618 annotated genes, 10928 probes)
# and the second most common (GPL90, 55 experiments, 5549 annotated genes, 9335
# probes). don't have much experience with yeast platforms myself so can't
# comment on their relative quality. This gets the data for both of these,
# saved to different directories
platforms = yeastMetadata %>%
    purrr::map('experimentData') %>%
    purrr::map_chr('platformName')

# Names of the (up to) two platforms with the most datasets. head() is used
# instead of indexing with 1:2 so that fewer than two distinct platforms yields
# a shorter vector rather than one padded with NA.
popularPlatforms = platforms %>%
    table() %>%
    sort(decreasing = TRUE) %>%
    names() %>%
    utils::head(2)
# keep only the metadata of datasets that use one of the popularPlatforms
yeastMetadata = yeastMetadata[platforms %in% popularPlatforms]

# recompute the platform of each dataset so that `platforms` lines up with the
# filtered metadata
platforms = purrr::map_chr(
    purrr::map(yeastMetadata, 'experimentData'),
    'platformName'
)

# dataset IDs of the remaining datasets
studyIDs = purrr::map_chr(
    purrr::map(yeastMetadata, 'experimentData'),
    'datasetID'
)

# request the 'differential' listing of every remaining dataset; the names of
# each returned list are kept in `differentials`
diffs = lapply(
    studyIDs,
    datasetInfo,
    request = 'differential', offset = 0, limit = 0
)
differentials = purrr::map(diffs, names)
# Download the differential expression results into one directory per platform,
# in case you want to analyze the platforms separately or ignore one of them.
# purrr::walk() is used because only the side effects (files on disk) matter;
# it also avoids auto-printing a nested list of results at top level.
purrr::walk(popularPlatforms, function(platform){
    # `differentials` and `platforms` were both derived from the filtered
    # yeastMetadata above and are aligned element by element, so subsetting
    # here avoids requesting the same differential listings a second time.
    platformDifferentials = differentials[platforms == platform]

    # one output directory per platform; ignore "already exists" warnings
    dir.create(platform, showWarnings = FALSE)

    for(x in names(platformDifferentials)){
        print(x)
        # these two studies returned errors.
        if(x %in% c('GSE17716','GSE40399')){
            next
        }

        # download each differential of this dataset
        for(id in platformDifferentials[[x]]){
            outFile = file.path(platform, paste0(x, '_', id))

            # Skip files downloaded by an earlier run. Original author's note:
            # this `if` is safe to remove.
            if(file.exists(outFile)){
                print('exists already. skipping')
                next
            }

            # A single failing download should not abort the remaining ones.
            # Remove the output file in case the failed request left a partial
            # one, so the existence check above does not skip it on a re-run.
            tryCatch(
                datasetInfo(x, request = 'degs', differential = id,
                            file = outFile, return = FALSE),
                error = function(e){
                    unlink(outFile)
                    warning('download failed for ', x, ' / ', id, ': ',
                            conditionMessage(e), call. = FALSE)
                }
            )
        }
    }
})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment