dmyersturnbull · January 20, 2017 04:12
diff --git a/srr_quake.py b/srr_quake.py

 # Douglas Myers-Turnbull wrote this while at UCSF. Because of this, the list of copyright owners is unknown and is not licensed (sorry!).

 from dl_and_rezip import dl_and_rezip # see https://gist.github.com/dmyersturnbull/a6591676fc98da355c5250d48e26844e
 from lines import lines
 from typing import Mapping, Iterable, Optional, Iterator, Callable
 import os
 import warnings
 import pandas as pd
 import re


 def unique(x: Iterable[str], name: str='<unknown>') -> Optional[str]:
    """Get the sole element in x or None if the size is not 1. Warns if multiple elements are in x but not if x is empty."""
    if len(x) == 1: return x[0]
    elif len(x) > 1: warnings.warn("{} results for {}".format(len(x), name))
    return None # if len is 0, don't warn because the caller will know anyway


 def grab(filename: str, starts: Mapping[str, str], strip: bool=True) -> Mapping[str, Iterable[str]]:
    """Example:
    Contents of filename.txt:
        abcdAString
        abcdADifferentString
        xyzYetAnotherString
    starts: {'AB': 'abcd', 'XY': 'xyz'}
    returns: {'AB': ['AString', 'ADifferentString'], 'xyz': 'YetAnotherString'}
    """
    values = {k: [] for k, v in starts.items()}
    for line in lines(filename):
        for start_key, start_value in starts.items():
            if line.startswith(start_value):
                values[start_key].append(line[len(start_value): ].strip())
    return values


 default_gsm_starts = {'chip_id': '!Sample_characteristics_ch1 = c1 chip id: ',
        'age': '!Sample_characteristics_ch1 = age:',
        'srx': '!Sample_relation = SRA: http://www.ncbi.nlm.nih.gov/sra?term=',
        'supplemental_file': '!Sample_supplementary_file_1 = '}
 #'cell_type': '!Sample_characteristics_ch1 = cell type:',
 # 'tissue': '!Sample_characteristics_ch1 = tissue:',


 def generate_rows(gse: str, gsm_starts: Optional[Mapping[str, str]]=None) -> Iterator[str]:
    """From a GSE accession, extract the ends of lines in the GSE text file that begin with a string in the values of gsm_starts."""
    if gsm_starts is None: gsm_starts = default_gsm_starts
    for gsm in grab(gse + '.txt.gz', {'sample': '!Series_sample_id = '})['sample']:
        dl_and_rezip("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={}&form=text".format(gsm), gsm + '.txt')
        items = {k: unique(v, name=k) for k, v in grab(gsm + '.txt.gz', gsm_starts).items()}
        yield (gsm, *items.values())

        
 def build_table(gse: str, gsm_starts: Optional[Mapping[str, str]]=None) -> pd.DataFrame:
    """Iterate over results from generate_rows to build a Pandas DataFrame."""
    if gsm_starts is None: gsm_starts = default_gsm_starts
    return pd.DataFrame.from_records(list(generate_rows(gse, gsm_starts=gsm_starts)), columns=['gsm'] + list(gsm_starts.keys())).set_index('gsm')


 df = build_table('GSE67835')


 quake_pattern = re.compile('(GSM\d+)_([^\.]+)\.([^\.]+)\.csv\.gz')
 def quake(url: str) -> str:
    """Extract an ID like 1772078217_C03 from a supplemental filename that looks like, 
    ex: ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1658nnn/GSM1658365/suppl/GSM1658365_nochipID4.C95.csv.gz
    Specifically, return group1 + '_' + group2 from (GSM\d+)_([^\.]+)\.([^\.]+)\.csv\.gz
    """
    match = quake_pattern.search(url)
    if match is None: return None
    # TODO: verify that group(1) matches the GSM in the index
    return match.group(2) + '_' + match.group(3)
 df['quake_id'] = df['supplemental_file'].map(quake)
 df = df.drop('supplemental_file', axis=1)


 srr_pattern = re.compile('SRR\d+')
 def grab_srr(srx: str) -> Optional[str]:
    """Download an SRA HTML file and extract the first substring matching SRR\d+
    Ex: grab_srr('SRX995861')
    """
    dl_and_rezip('https://www.ncbi.nlm.nih.gov/sra/' + srx, srx + '.txt')
    match = srr_pattern.search(''.join(lines(srx + '.txt.gz')))
    return None if match is None else match.group(0)


 df['srr'] = df['srx'].map(grab_srr)
 df = df.drop('chip_id', axis=1)

 # This is a very bad-form unit test :(
 assert ['SRR1975000'] == list(df.reset_index()[df.reset_index()['gsm'] == 'GSM1658358']['srr'])

 df.to_csv('samples.csv.gz', compression='gzip')
 df

	# Douglas Myers-Turnbull wrote this while at UCSF. Because of this, the list of copyright owners is unknown and is not licensed (sorry!).

	from dl_and_rezip import dl_and_rezip # see https://gist.github.com/dmyersturnbull/a6591676fc98da355c5250d48e26844e
	from lines import lines
	from typing import Mapping, Iterable, Optional, Iterator, Callable
	import os
	import warnings
	import pandas as pd
	import re


	def unique(x: Iterable[str], name: str='<unknown>') -> Optional[str]:
	"""Get the sole element in x or None if the size is not 1. Warns if multiple elements are in x but not if x is empty."""
	if len(x) == 1: return x[0]
	elif len(x) > 1: warnings.warn("{} results for {}".format(len(x), name))
	return None # if len is 0, don't warn because the caller will know anyway


	def grab(filename: str, starts: Mapping[str, str], strip: bool=True) -> Mapping[str, Iterable[str]]:
	"""Example:
	Contents of filename.txt:
	abcdAString
	abcdADifferentString
	xyzYetAnotherString
	starts: {'AB': 'abcd', 'XY': 'xyz'}
	returns: {'AB': ['AString', 'ADifferentString'], 'xyz': 'YetAnotherString'}
	"""
	values = {k: [] for k, v in starts.items()}
	for line in lines(filename):
	for start_key, start_value in starts.items():
	if line.startswith(start_value):
	values[start_key].append(line[len(start_value): ].strip())
	return values


	default_gsm_starts = {'chip_id': '!Sample_characteristics_ch1 = c1 chip id: ',
	'age': '!Sample_characteristics_ch1 = age:',
	'srx': '!Sample_relation = SRA: http://www.ncbi.nlm.nih.gov/sra?term=',
	'supplemental_file': '!Sample_supplementary_file_1 = '}
	#'cell_type': '!Sample_characteristics_ch1 = cell type:',
	# 'tissue': '!Sample_characteristics_ch1 = tissue:',


	def generate_rows(gse: str, gsm_starts: Optional[Mapping[str, str]]=None) -> Iterator[str]:
	"""From a GSE accession, extract the ends of lines in the GSE text file that begin with a string in the values of gsm_starts."""
	if gsm_starts is None: gsm_starts = default_gsm_starts
	for gsm in grab(gse + '.txt.gz', {'sample': '!Series_sample_id = '})['sample']:
	dl_and_rezip("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={}&form=text".format(gsm), gsm + '.txt')
	items = {k: unique(v, name=k) for k, v in grab(gsm + '.txt.gz', gsm_starts).items()}
	yield (gsm, *items.values())


	def build_table(gse: str, gsm_starts: Optional[Mapping[str, str]]=None) -> pd.DataFrame:
	"""Iterate over results from generate_rows to build a Pandas DataFrame."""
	if gsm_starts is None: gsm_starts = default_gsm_starts
	return pd.DataFrame.from_records(list(generate_rows(gse, gsm_starts=gsm_starts)), columns=['gsm'] + list(gsm_starts.keys())).set_index('gsm')


	df = build_table('GSE67835')


	quake_pattern = re.compile('(GSM\d+)_([^\.]+)\.([^\.]+)\.csv\.gz')
	def quake(url: str) -> str:
	"""Extract an ID like 1772078217_C03 from a supplemental filename that looks like,
	ex: ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1658nnn/GSM1658365/suppl/GSM1658365_nochipID4.C95.csv.gz
	Specifically, return group1 + '_' + group2 from (GSM\d+)_([^\.]+)\.([^\.]+)\.csv\.gz
	"""
	match = quake_pattern.search(url)
	if match is None: return None
	# TODO: verify that group(1) matches the GSM in the index
	return match.group(2) + '_' + match.group(3)
	df['quake_id'] = df['supplemental_file'].map(quake)
	df = df.drop('supplemental_file', axis=1)


	srr_pattern = re.compile('SRR\d+')
	def grab_srr(srx: str) -> Optional[str]:
	"""Download an SRA HTML file and extract the first substring matching SRR\d+
	Ex: grab_srr('SRX995861')
	"""
	dl_and_rezip('https://www.ncbi.nlm.nih.gov/sra/' + srx, srx + '.txt')
	match = srr_pattern.search(''.join(lines(srx + '.txt.gz')))
	return None if match is None else match.group(0)


	df['srr'] = df['srx'].map(grab_srr)
	df = df.drop('chip_id', axis=1)

	# This is a very bad-form unit test :(
	assert ['SRR1975000'] == list(df.reset_index()[df.reset_index()['gsm'] == 'GSM1658358']['srr'])

	df.to_csv('samples.csv.gz', compression='gzip')
	df