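"""Create the ``input:`` section for distiller's project.yml from GEO/ENA/SRA accessions.

Fetches run metadata with pysradb, optionally filters and renames the
experiments with regular expressions, and prints a YAML snippet to stdout.
"""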
import sys
import argparse
import re
import numpy as np
import pandas as pd
import pysradb
# def unescaped_str(arg_str):
#     """
#     Borrowed from https://stackoverflow.com/questions/34145686/handling-argparse-escaped-character-as-option
#     """
#     return codecs.decode(str(arg_str), 'unicode-escape')
class SmartFormatter(argparse.HelpFormatter):
    '''
    Custom help formatter that splits help text on embedded newlines
    before applying the usual width-based wrapping.
    '''
    def _split_lines(self, text, width):
        r = []
        for t in text.splitlines():
            r.extend(argparse.HelpFormatter._split_lines(self, t, width))
        return r
def parse_args(args):
    parser = argparse.ArgumentParser(
        description="Create the input section for distiller's project.yml from GEO/ENA/SRA accessions.",
        # formatter_class=argparse.RawDescriptionHelpFormatter
        formatter_class=SmartFormatter
    )
    parser.add_argument(
        'accessions',
        metavar='N',
        type=str,
        nargs='+',
        help='GEO/SRA/ENA accession of a Hi-C project. Multiple values are allowed.')
    parser.add_argument(
        '--title_sub',
        nargs=2,
        action='append',
        # type=unescaped_str,
        default=[],
        help='A regular expression substitution used to clean up the experiment titles. '
             'Multiple sequential substitutions are allowed; each must be provided '
             'with its own --title_sub flag, followed by a pair of regular expressions '
             'pat repl, separated by a space, where pat is the matching pattern and '
             'repl is the replacement string. '
             'Internally, these expressions are passed to pandas.Series.str.replace() '
             '(i.e. re.sub() semantics). '
             'The default substitutions (1) replace whitespace with underscores and '
             '(2) remove characters outside of A-Z a-z 0-9 ._- '
             '(a.k.a. the POSIX portable filename character set):'
             '\n'
             r"--title_sub '\s' '_' --title_sub '[^\w_.-]' ''"
    )
    parser.add_argument(
        '--title_column',
        type=str,
        default='experiment_title',
        help='Name of the SRA metadata column to use as the experiment title.'
    )
    parser.add_argument(
        '--group_sub',
        nargs=2,
        action='append',
        # type=unescaped_str,
        default=[],
        help='A regular expression substitution used to convert experiment titles '
             'into library groups. The usage is the same as for --title_sub. '
             'The default substitution removes suffixes like _R1/_R2/_rep1/-R1 '
             'at the end of the experiment title:'
             '\n'
             r"--group_sub '[_-](R|rep)_?\d+$' ''"
    )
    parser.add_argument(
        '--filter_pre',
        action='append',
        default=[],
        type=str,
        help='A regular expression to filter datasets by their *unedited* title. '
             'If multiple filters are provided, datasets that satisfy at least one '
             'of them are kept. Example:'
             '\n'
             r"--filter_pre '[Hh][Ii]-?[Cc]'"
    )
    parser.add_argument(
        '--filter_post',
        action='append',
        default=[],
        type=str,
        help='A regular expression to filter datasets by their *edited* title. '
             'If multiple filters are provided, datasets that satisfy at least one '
             'of them are kept. Example:'
             '\n'
             r"--filter_post '[Hh][Ii]-?[Cc]'"
    )
    return parser.parse_args(args)
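# A sketch of a typical invocation (the script name and accession are
# placeholders; the filter pattern is the one suggested in the help text):
#
#   python make_distiller_input.py GSE12345 \
#       --filter_pre '[Hh][Ii]-?[Cc]' > input_section.yml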
def to_downloadable(queries):
    out_queries = []
    for q in queries:
        if q.startswith('GSE'):
            # GEO series are not directly downloadable; map them to their
            # SRA study accessions (SRP) first.
            out_queries += list(
                pysradb.SRAweb()
                .gse_to_srp(q)
                .study_accession
            )
        else:
            out_queries.append(q)
    return out_queries
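# Illustration (accessions made up): to_downloadable(['GSE12345', 'SRP000001'])
# resolves the GEO series via pysradb and returns something like
# ['SRP067890', 'SRP000001'], leaving non-GSE accessions untouched.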
DEFAULT_TITLE_SUB = [
    (r'\s', '_'),
    # NB: the hyphen must stay at the end of the character class,
    # otherwise it denotes a range
    (r'[^\w_.-]', ''),
]
DEFAULT_GROUP_SUB = [
    (r'[_-](R|rep)_?\d+$', ''),
]
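# Worked example of the defaults (the title is made up): a raw title like
# 'K562 Hi-C rep1' becomes 'K562_Hi-C_rep1' after DEFAULT_TITLE_SUB, and
# DEFAULT_GROUP_SUB then maps it to the group 'K562_Hi-C'.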
# Indentation unit for the YAML output; four spaces assumed here, adjust
# if your project.yml uses a different unit.
TAB_CHAR = '    '
args = parse_args(sys.argv[1:])
db = pysradb.SRAweb()
queries = to_downloadable(args.accessions)
srr_table = pd.concat([
    db.sra_metadata(q)
    for q in queries
])
srr_table = srr_table[[args.title_column, 'run_accession']].astype(str)
# srr_table[args.title_column] = (
#     srr_table[args.title_column]
#     .str.split(';')
#     .str.get(0)
#     .str.split(':', n=1)
#     .str.get(1)
#     .str.strip()
# )
if args.filter_pre:
    # Keep only the rows whose raw title matches at least one filter.
    mask = np.logical_or.reduce([
        srr_table[args.title_column].str.contains(fltr, regex=True)
        for fltr in args.filter_pre])
    srr_table = srr_table[mask]
for re_sub in (args.title_sub if args.title_sub else DEFAULT_TITLE_SUB):
    srr_table[args.title_column] = (
        srr_table[args.title_column]
        .str.replace(re_sub[0], re_sub[1], regex=True)
    )
if args.filter_post:
    mask = np.logical_or.reduce([
        srr_table[args.title_column].str.contains(fltr, regex=True)
        for fltr in args.filter_post])
    srr_table = srr_table[mask]
srr_table = srr_table.sort_values([args.title_column, 'run_accession'])
# Number the runs within each experiment: lane1, lane2, ...
srr_table['lane'] = (
    'lane'
    + (srr_table.groupby(args.title_column).cumcount() + 1)
    .astype('str')
)
group = srr_table[args.title_column]
for sub in (args.group_sub if args.group_sub else DEFAULT_GROUP_SUB):
    group = group.str.replace(sub[0], sub[1], regex=True)
srr_table['group'] = group
# Keeping this code in case YAML structures become useful:
# out_raw_reads_paths = {}
# for title, grouped in srr_table.groupby('experiment_title'):
#     out_raw_reads_paths[title] = {
#         row.lane: f'- sra:{row.run_accession}'
#         for _, row in grouped.iterrows()
#     }
# out_library_groups = {}
# for group, grouped in srr_table.groupby('group'):
#     experiment_titles = list(grouped.experiment_title.unique())
#     if len(experiment_titles) > 1:
#         out_library_groups[group] = experiment_titles
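# Shape of the printed block, matching what distiller expects in project.yml
# (titles and SRR accessions are made up):
#
# input:
#     raw_reads_paths:
#         K562_Hi-C_rep1:
#             lane1:
#                 - sra:SRR0000001
#             lane2:
#                 - sra:SRR0000002
#     library_groups:
#         K562_Hi-C:
#             - K562_Hi-C_rep1
#             - K562_Hi-C_rep2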
out_raw_reads_paths = [f'{TAB_CHAR}raw_reads_paths:']
for title, grouped in srr_table.groupby(args.title_column):
    out_raw_reads_paths.append(f'{TAB_CHAR}{TAB_CHAR}{title}:')
    for _, row in grouped.iterrows():
        out_raw_reads_paths.append(f'{TAB_CHAR}{TAB_CHAR}{TAB_CHAR}{row.lane}:')
        out_raw_reads_paths.append(
            f'{TAB_CHAR}{TAB_CHAR}{TAB_CHAR}{TAB_CHAR}- sra:{row.run_accession}')

out_library_groups = [f'{TAB_CHAR}library_groups:']
for group, grouped in srr_table.groupby('group'):
    experiment_titles = grouped[args.title_column].unique()
    # Only emit a library group if it combines more than one experiment.
    if len(experiment_titles) > 1:
        out_library_groups.append(f'{TAB_CHAR}{TAB_CHAR}{group}:')
        out_library_groups += [f'{TAB_CHAR}{TAB_CHAR}{TAB_CHAR}- {title}'
                               for title in experiment_titles]

out = '\n'.join(['input:'] + out_raw_reads_paths + out_library_groups)
print(out)