Last active
January 20, 2017 04:12
-
-
Save dmyersturnbull/cfb7dd5d6b53e1ec9437d65581a32f34 to your computer and use it in GitHub Desktop.
Extract metadata from Gene Expression Omnibus and other NCBI resources to match up identifiers, etc. For PMID:26060301.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Douglas Myers-Turnbull wrote this while at UCSF. Because of this, the list of copyright owners is unknown and is not licensed (sorry!). | |
from dl_and_rezip import dl_and_rezip # see https://gist.github.com/dmyersturnbull/a6591676fc98da355c5250d48e26844e | |
from lines import lines | |
from typing import Mapping, Iterable, Optional, Iterator, Callable | |
import os | |
import warnings | |
import pandas as pd | |
import re | |
def unique(x: Iterable[str], name: str='<unknown>') -> Optional[str]: | |
"""Get the sole element in x or None if the size is not 1. Warns if multiple elements are in x but not if x is empty.""" | |
if len(x) == 1: return x[0] | |
elif len(x) > 1: warnings.warn("{} results for {}".format(len(x), name)) | |
return None # if len is 0, don't warn because the caller will know anyway | |
def grab(filename: str, starts: Mapping[str, str], strip: bool=True) -> Mapping[str, Iterable[str]]: | |
"""Example: | |
Contents of filename.txt: | |
abcdAString | |
abcdADifferentString | |
xyzYetAnotherString | |
starts: {'AB': 'abcd', 'XY': 'xyz'} | |
returns: {'AB': ['AString', 'ADifferentString'], 'xyz': 'YetAnotherString'} | |
""" | |
values = {k: [] for k, v in starts.items()} | |
for line in lines(filename): | |
for start_key, start_value in starts.items(): | |
if line.startswith(start_value): | |
values[start_key].append(line[len(start_value): ].strip()) | |
return values | |
default_gsm_starts = {'chip_id': '!Sample_characteristics_ch1 = c1 chip id: ', | |
'age': '!Sample_characteristics_ch1 = age:', | |
'srx': '!Sample_relation = SRA: http://www.ncbi.nlm.nih.gov/sra?term=', | |
'supplemental_file': '!Sample_supplementary_file_1 = '} | |
#'cell_type': '!Sample_characteristics_ch1 = cell type:', | |
# 'tissue': '!Sample_characteristics_ch1 = tissue:', | |
def generate_rows(gse: str, gsm_starts: Optional[Mapping[str, str]]=None) -> Iterator[str]: | |
"""From a GSE accession, extract the ends of lines in the GSE text file that begin with a string in the values of gsm_starts.""" | |
if gsm_starts is None: gsm_starts = default_gsm_starts | |
for gsm in grab(gse + '.txt.gz', {'sample': '!Series_sample_id = '})['sample']: | |
dl_and_rezip("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={}&form=text".format(gsm), gsm + '.txt') | |
items = {k: unique(v, name=k) for k, v in grab(gsm + '.txt.gz', gsm_starts).items()} | |
yield (gsm, *items.values()) | |
def build_table(gse: str, gsm_starts: Optional[Mapping[str, str]]=None) -> pd.DataFrame: | |
"""Iterate over results from generate_rows to build a Pandas DataFrame.""" | |
if gsm_starts is None: gsm_starts = default_gsm_starts | |
return pd.DataFrame.from_records(list(generate_rows(gse, gsm_starts=gsm_starts)), columns=['gsm'] + list(gsm_starts.keys())).set_index('gsm') | |
df = build_table('GSE67835') | |
quake_pattern = re.compile('(GSM\d+)_([^\.]+)\.([^\.]+)\.csv\.gz') | |
def quake(url: str) -> str: | |
"""Extract an ID like 1772078217_C03 from a supplemental filename that looks like, | |
ex: ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1658nnn/GSM1658365/suppl/GSM1658365_nochipID4.C95.csv.gz | |
Specifically, return group1 + '_' + group2 from (GSM\d+)_([^\.]+)\.([^\.]+)\.csv\.gz | |
""" | |
match = quake_pattern.search(url) | |
if match is None: return None | |
# TODO: verify that group(1) matches the GSM in the index | |
return match.group(2) + '_' + match.group(3) | |
df['quake_id'] = df['supplemental_file'].map(quake) | |
df = df.drop('supplemental_file', axis=1) | |
srr_pattern = re.compile('SRR\d+') | |
def grab_srr(srx: str) -> Optional[str]: | |
"""Download an SRA HTML file and extract the first substring matching SRR\d+ | |
Ex: grab_srr('SRX995861') | |
""" | |
dl_and_rezip('https://www.ncbi.nlm.nih.gov/sra/' + srx, srx + '.txt') | |
match = srr_pattern.search(''.join(lines(srx + '.txt.gz'))) | |
return None if match is None else match.group(0) | |
df['srr'] = df['srx'].map(grab_srr) | |
df = df.drop('chip_id', axis=1) | |
# This is a very bad-form unit test :( | |
assert ['SRR1975000'] == list(df.reset_index()[df.reset_index()['gsm'] == 'GSM1658358']['srr']) | |
df.to_csv('samples.csv.gz', compression='gzip') | |
df |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment