Last active
May 2, 2018 13:30
-
-
Save brentp/7121430 to your computer and use it in GitHub Desktop.
convert files values from TCGA, which generally contain values for a single sample per file into a matrix with rows of probes and columns of samples
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
create a BED file with position and strand given the genes from a | |
TCGA level 3 expression matrix | |
""" | |
from cruzdb import Genome | |
from toolshed import reader | |
import sys | |
import os.path as op | |
matrix = sys.argv[1] # has a column named "probe" that is a refGen name | |
if not op.exists('hg19.db'): | |
g = Genome('hg19').mirror(('refGene',) 'sqlite:///hg19.db') | |
else: | |
g = Genome('hg19.db') | |
print "#chrom\tstart\tend\tprobe\tstrand\tsymbol" | |
for d in reader('brca-expr.matrix.txt'): | |
res = [r for r in g.refGene.filter_by(name2=d['probe']) if not "_" in | |
r.chrom] | |
if len(set(r.chrom for r in res)) != 1: | |
continue # FAM138F | |
if res == []: | |
print >>sys.stderr, d['probe'] | |
1/0 | |
else: | |
r = res[0] | |
name = r.name2 | |
print "\t".join(map(str, (r.chrom, r.start, r.end, d['probe'], r.strand, | |
name))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
convert single-file methylation values from TCGA into a matrix | |
with rows of probes and columns of samples | |
can logit-transform, sort by position, and output row-names of chr:pos | |
Also extracts out the TCGA-... id's into a file with a bit of clinical info | |
""" | |
import sys | |
import os.path as op | |
from itertools import izip | |
from math import log | |
def get_sample_map(fname): | |
""" | |
map file name to sample e.g.: | |
jhu-usc.edu_BRCA.HumanMethylation450.1.lvl-3.TCGA-A7-A0CE-11A-21D-A10Q-05.txt TCGA-A7-A0CE-11A-21D-A10Q-05 | |
this is needed for expression files, for example | |
""" | |
return dict(t.rstrip('\r\n').split("\t") for t in open(fname) if t.strip()) | |
def main(prefix, logit, locs, do_sort, file_sample_map, files): | |
file_map = get_sample_map(file_sample_map) | |
# sort by the sample-id so we can get same order for expr, meth shoud it be | |
# needed | |
files = [f for s, f in sorted((tcga_to_info(file_map[op.basename(f)])['id'], f) for f in | |
files)] | |
file_iters = [(x.rstrip("\r\n").split("\t") for x in open(f)) for f in files] | |
fh_meth = open('%s.matrix.txt' % prefix, 'w') | |
fh_covs = open('%s.covs.txt' % prefix, 'w') | |
def tryfloat(v): | |
try: | |
v = float(v) | |
except ValueError: | |
return "" | |
return "%.3f" % (log(v / (1 - v)) if logit else v) | |
out_lines = [] | |
for i, rows in enumerate(izip(*file_iters)): | |
assert len(set(r[0] for r in rows)) == 1, rows | |
if i == 0: | |
#samples = [r[1] for r in rows] | |
samples = [file_map[op.basename(f)] for f in files] | |
samp_ids = print_sample_info(samples, fh_covs) | |
print >>fh_meth, "\t".join(["probe"] + samp_ids) | |
if i < 2: continue | |
vals = [r[1] for r in rows] | |
if all(v == "NA" for v in vals): continue | |
vals = [tryfloat(v) for v in vals] | |
if locs: | |
chrom, pos = rows[0][-2], int(rows[0][-1]) | |
loc = "%s:%i" % (chrom, pos) | |
else: | |
loc = r[0] | |
row = "\t".join([loc] + vals) | |
if do_sort: | |
out_lines.append((chrom, pos, row)) | |
else: | |
print >>fh_meth, row | |
if do_sort: | |
print >>fh_meth, "\n".join(d[2] for d in sorted(out_lines)) | |
def print_sample_info(samples, fh): | |
ids = [] | |
for j, sample in enumerate(samples): | |
info = tcga_to_info(sample) | |
sub_id = info.pop('id') | |
if j == 0: | |
print >>fh, "\t".join(['id', 'full_id'] + info.keys()) | |
print >>fh, "\t".join([sub_id, sample] + info.values()) | |
ids.append(sub_id) | |
return ids | |
def tcga_to_info(tcga_id): | |
""" | |
https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode | |
""" | |
tokens = tcga_id.split("-") | |
assert tokens[0] == "TCGA", (tokens) | |
keys = ('tissue_source', 'participant', 'sample_type_vial', | |
'portion_analyte', 'plate', 'center') | |
d = dict(zip(keys, tokens[1:])) | |
sv = d.pop('sample_type_vial') | |
st = sv[:2] | |
st = {'01': 'tumor', '11': 'normal', '10': 'blood'}.get(st, st) | |
d['sample_type'] = st | |
d['vial'] = sv[2:] | |
pa = d.pop('portion_analyte') | |
d['portion'] = pa[:2] | |
d['analyte'] = pa[2:] | |
d['id'] = "{participant}.{sample_type}.{tissue_source}".format(**d) | |
return d | |
if __name__ == "__main__": | |
import argparse | |
ap = argparse.ArgumentParser(__doc__) | |
ap.add_argument("--logit", action="store_true") | |
ap.add_argument("--sorted", action="store_true", help="sort output by" | |
" chrom, position; hold everything in memory") | |
ap.add_argument("--locs", action="store_true", help="use e.g. 1:21434" | |
"(chrom:pos) as the probe name rather than cg34544353") | |
ap.add_argument('--prefix', help="output prefix") | |
ap.add_argument('file_sample_map', help="path to FILE_SAMPLE_MAP.txt" | |
"included in the download from TCGA") | |
ap.add_argument('files', nargs='+', help="data-files") | |
args = ap.parse_args() | |
if not args.prefix: | |
sys.exit(ap.print_help()) | |
main(args.prefix, args.logit, args.locs, args.sorted, args.file_sample_map, args.files) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment