Skip to content

Instantly share code, notes, and snippets.

@brentp
Last active May 2, 2018 13:30
Show Gist options
  • Save brentp/7121430 to your computer and use it in GitHub Desktop.
Save brentp/7121430 to your computer and use it in GitHub Desktop.
convert files values from TCGA, which generally contain values for a single sample per file into a matrix with rows of probes and columns of samples
"""
create a BED file with position and strand given the genes from a
TCGA level 3 expression matrix
"""
from cruzdb import Genome
from toolshed import reader
import sys
import os.path as op
matrix = sys.argv[1] # has a column named "probe" that is a refGen name
if not op.exists('hg19.db'):
g = Genome('hg19').mirror(('refGene',) 'sqlite:///hg19.db')
else:
g = Genome('hg19.db')
print "#chrom\tstart\tend\tprobe\tstrand\tsymbol"
for d in reader('brca-expr.matrix.txt'):
res = [r for r in g.refGene.filter_by(name2=d['probe']) if not "_" in
r.chrom]
if len(set(r.chrom for r in res)) != 1:
continue # FAM138F
if res == []:
print >>sys.stderr, d['probe']
1/0
else:
r = res[0]
name = r.name2
print "\t".join(map(str, (r.chrom, r.start, r.end, d['probe'], r.strand,
name)))
"""
given 2 FILE_SAMPLE_MAP.txt, from 2 different platforms (e.g. methylation
and expression), create a new FILE_SAMPLE_MAP.shared.txt for each that contains
only the samples that are in both files
"""
import sys
from toolshed import reader
from operator import itemgetter
f1 = sys.argv[1]
f2 = sys.argv[2]
def tcga_to_info(tcga_id):
"""
https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode
"""
tokens = tcga_id.split("-")
assert tokens[0] == "TCGA", (tokens)
keys = ('tissue_source', 'participant', 'sample_type_vial',
'portion_analyte', 'plate', 'center')
d = dict(zip(keys, tokens[1:]))
sv = d.pop('sample_type_vial')
st = sv[:2]
st = {'01': 'tumor', '11': 'normal', '10': 'blood'}.get(st, st)
d['sample_type'] = st
d['vial'] = sv[2:]
pa = d.pop('portion_analyte')
d['portion'] = pa[:2]
d['analyte'] = pa[2:]
d['id'] = "{participant}.{sample_type}.{tissue_source}".format(**d)
return d['id']
shared = [tcga_to_info(d[1]) for d in reader(f1, header=False) if d if d[0] !=
"filename"]
assert len(shared) == len(set(shared))
shared2 = [tcga_to_info(d[1]) for d in reader(f2, header=False) if d if d[0] !=
"filename"]
assert len(shared2) == len(set(shared2))
shared = set(shared2).intersection(shared)
def print_new(f, shared):
fnew = open(f[:-4] + ".shared.txt", "w")
out = []
for i, d in enumerate(reader(f, header="ordered")):
if not d: continue
if i == 0:
print >>fnew, "\t".join(d.keys())
if not tcga_to_info(d['barcode(s)']) in shared: continue
out.append(d.values())
# sort so we get the same order for both.
print out[:10]
for line in sorted(out, key=itemgetter(1)):
print >>fnew, "\t".join(line)
print fnew.name
print_new(f1, shared)
print_new(f2, shared)
"""
convert single-file methylation values from TCGA into a matrix
with rows of probes and columns of samples
can logit-transform, sort by position, and output row-names of chr:pos
Also extracts out the TCGA-... id's into a file with a bit of clinical info
"""
import sys
import os.path as op
from itertools import izip
from math import log
def get_sample_map(fname):
"""
map file name to sample e.g.:
jhu-usc.edu_BRCA.HumanMethylation450.1.lvl-3.TCGA-A7-A0CE-11A-21D-A10Q-05.txt TCGA-A7-A0CE-11A-21D-A10Q-05
this is needed for expression files, for example
"""
return dict(t.rstrip('\r\n').split("\t") for t in open(fname) if t.strip())
def main(prefix, logit, locs, do_sort, file_sample_map, files):
file_map = get_sample_map(file_sample_map)
# sort by the sample-id so we can get same order for expr, meth shoud it be
# needed
files = [f for s, f in sorted((tcga_to_info(file_map[op.basename(f)])['id'], f) for f in
files)]
file_iters = [(x.rstrip("\r\n").split("\t") for x in open(f)) for f in files]
fh_meth = open('%s.matrix.txt' % prefix, 'w')
fh_covs = open('%s.covs.txt' % prefix, 'w')
def tryfloat(v):
try:
v = float(v)
except ValueError:
return ""
return "%.3f" % (log(v / (1 - v)) if logit else v)
out_lines = []
for i, rows in enumerate(izip(*file_iters)):
assert len(set(r[0] for r in rows)) == 1, rows
if i == 0:
#samples = [r[1] for r in rows]
samples = [file_map[op.basename(f)] for f in files]
samp_ids = print_sample_info(samples, fh_covs)
print >>fh_meth, "\t".join(["probe"] + samp_ids)
if i < 2: continue
vals = [r[1] for r in rows]
if all(v == "NA" for v in vals): continue
vals = [tryfloat(v) for v in vals]
if locs:
chrom, pos = rows[0][-2], int(rows[0][-1])
loc = "%s:%i" % (chrom, pos)
else:
loc = r[0]
row = "\t".join([loc] + vals)
if do_sort:
out_lines.append((chrom, pos, row))
else:
print >>fh_meth, row
if do_sort:
print >>fh_meth, "\n".join(d[2] for d in sorted(out_lines))
def print_sample_info(samples, fh):
ids = []
for j, sample in enumerate(samples):
info = tcga_to_info(sample)
sub_id = info.pop('id')
if j == 0:
print >>fh, "\t".join(['id', 'full_id'] + info.keys())
print >>fh, "\t".join([sub_id, sample] + info.values())
ids.append(sub_id)
return ids
def tcga_to_info(tcga_id):
"""
https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode
"""
tokens = tcga_id.split("-")
assert tokens[0] == "TCGA", (tokens)
keys = ('tissue_source', 'participant', 'sample_type_vial',
'portion_analyte', 'plate', 'center')
d = dict(zip(keys, tokens[1:]))
sv = d.pop('sample_type_vial')
st = sv[:2]
st = {'01': 'tumor', '11': 'normal', '10': 'blood'}.get(st, st)
d['sample_type'] = st
d['vial'] = sv[2:]
pa = d.pop('portion_analyte')
d['portion'] = pa[:2]
d['analyte'] = pa[2:]
d['id'] = "{participant}.{sample_type}.{tissue_source}".format(**d)
return d
if __name__ == "__main__":
import argparse
ap = argparse.ArgumentParser(__doc__)
ap.add_argument("--logit", action="store_true")
ap.add_argument("--sorted", action="store_true", help="sort output by"
" chrom, position; hold everything in memory")
ap.add_argument("--locs", action="store_true", help="use e.g. 1:21434"
"(chrom:pos) as the probe name rather than cg34544353")
ap.add_argument('--prefix', help="output prefix")
ap.add_argument('file_sample_map', help="path to FILE_SAMPLE_MAP.txt"
"included in the download from TCGA")
ap.add_argument('files', nargs='+', help="data-files")
args = ap.parse_args()
if not args.prefix:
sys.exit(ap.print_help())
main(args.prefix, args.logit, args.locs, args.sorted, args.file_sample_map, args.files)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment