Skip to content

Instantly share code, notes, and snippets.

@dmyersturnbull
Last active January 20, 2017 04:12
Show Gist options
  • Save dmyersturnbull/cfb7dd5d6b53e1ec9437d65581a32f34 to your computer and use it in GitHub Desktop.
Save dmyersturnbull/cfb7dd5d6b53e1ec9437d65581a32f34 to your computer and use it in GitHub Desktop.
Extract metadata from Gene Expression Omnibus and other NCBI resources to match up identifiers, etc. For PMID:26060301.
# Douglas Myers-Turnbull wrote this while at UCSF. Because of this, the list of copyright owners is unknown and is not licensed (sorry!).
from dl_and_rezip import dl_and_rezip # see https://gist.github.com/dmyersturnbull/a6591676fc98da355c5250d48e26844e
from lines import lines
from typing import Mapping, Iterable, Optional, Iterator, Callable
import os
import warnings
import pandas as pd
import re
def unique(x: Iterable[str], name: str='<unknown>') -> Optional[str]:
"""Get the sole element in x or None if the size is not 1. Warns if multiple elements are in x but not if x is empty."""
if len(x) == 1: return x[0]
elif len(x) > 1: warnings.warn("{} results for {}".format(len(x), name))
return None # if len is 0, don't warn because the caller will know anyway
def grab(filename: str, starts: Mapping[str, str], strip: bool=True) -> Mapping[str, Iterable[str]]:
"""Example:
Contents of filename.txt:
abcdAString
abcdADifferentString
xyzYetAnotherString
starts: {'AB': 'abcd', 'XY': 'xyz'}
returns: {'AB': ['AString', 'ADifferentString'], 'xyz': 'YetAnotherString'}
"""
values = {k: [] for k, v in starts.items()}
for line in lines(filename):
for start_key, start_value in starts.items():
if line.startswith(start_value):
values[start_key].append(line[len(start_value): ].strip())
return values
default_gsm_starts = {'chip_id': '!Sample_characteristics_ch1 = c1 chip id: ',
'age': '!Sample_characteristics_ch1 = age:',
'srx': '!Sample_relation = SRA: http://www.ncbi.nlm.nih.gov/sra?term=',
'supplemental_file': '!Sample_supplementary_file_1 = '}
#'cell_type': '!Sample_characteristics_ch1 = cell type:',
# 'tissue': '!Sample_characteristics_ch1 = tissue:',
def generate_rows(gse: str, gsm_starts: Optional[Mapping[str, str]]=None) -> Iterator[str]:
"""From a GSE accession, extract the ends of lines in the GSE text file that begin with a string in the values of gsm_starts."""
if gsm_starts is None: gsm_starts = default_gsm_starts
for gsm in grab(gse + '.txt.gz', {'sample': '!Series_sample_id = '})['sample']:
dl_and_rezip("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={}&form=text".format(gsm), gsm + '.txt')
items = {k: unique(v, name=k) for k, v in grab(gsm + '.txt.gz', gsm_starts).items()}
yield (gsm, *items.values())
def build_table(gse: str, gsm_starts: Optional[Mapping[str, str]]=None) -> pd.DataFrame:
"""Iterate over results from generate_rows to build a Pandas DataFrame."""
if gsm_starts is None: gsm_starts = default_gsm_starts
return pd.DataFrame.from_records(list(generate_rows(gse, gsm_starts=gsm_starts)), columns=['gsm'] + list(gsm_starts.keys())).set_index('gsm')
df = build_table('GSE67835')
quake_pattern = re.compile('(GSM\d+)_([^\.]+)\.([^\.]+)\.csv\.gz')
def quake(url: str) -> str:
"""Extract an ID like 1772078217_C03 from a supplemental filename that looks like,
ex: ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1658nnn/GSM1658365/suppl/GSM1658365_nochipID4.C95.csv.gz
Specifically, return group1 + '_' + group2 from (GSM\d+)_([^\.]+)\.([^\.]+)\.csv\.gz
"""
match = quake_pattern.search(url)
if match is None: return None
# TODO: verify that group(1) matches the GSM in the index
return match.group(2) + '_' + match.group(3)
df['quake_id'] = df['supplemental_file'].map(quake)
df = df.drop('supplemental_file', axis=1)
srr_pattern = re.compile('SRR\d+')
def grab_srr(srx: str) -> Optional[str]:
"""Download an SRA HTML file and extract the first substring matching SRR\d+
Ex: grab_srr('SRX995861')
"""
dl_and_rezip('https://www.ncbi.nlm.nih.gov/sra/' + srx, srx + '.txt')
match = srr_pattern.search(''.join(lines(srx + '.txt.gz')))
return None if match is None else match.group(0)
df['srr'] = df['srx'].map(grab_srr)
df = df.drop('chip_id', axis=1)
# This is a very bad-form unit test :(
assert ['SRR1975000'] == list(df.reset_index()[df.reset_index()['gsm'] == 'GSM1658358']['srr'])
df.to_csv('samples.csv.gz', compression='gzip')
df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment