brentp · May 2, 2018 13:30
diff --git a/annotate-genes.py b/annotate-genes.py
 """
 create a BED file with position and strand given the genes from a
 TCGA level 3 expression matrix
 """
 from cruzdb import Genome
 from toolshed import reader
 import sys
 import os.path as op

 matrix = sys.argv[1] # has a column named "probe" that is a refGen name

 if not op.exists('hg19.db'):
    g = Genome('hg19').mirror(('refGene',) 'sqlite:///hg19.db')
 else:
    g = Genome('hg19.db')

 print "#chrom\tstart\tend\tprobe\tstrand\tsymbol"
 for d in reader('brca-expr.matrix.txt'):
    res = [r for r in g.refGene.filter_by(name2=d['probe']) if not "_" in
            r.chrom]
    if len(set(r.chrom for r in res)) != 1:
        continue # FAM138F

    if res == []:
        print >>sys.stderr, d['probe']
        1/0
    else:
        r = res[0]
        name = r.name2

    print "\t".join(map(str, (r.chrom, r.start, r.end, d['probe'], r.strand,
        name)))
diff --git a/shared-samples.py b/shared-samples.py
 """
 given 2 FILE_SAMPLE_MAP.txt, from 2 different platforms (e.g. methylation
 and expression), create a new FILE_SAMPLE_MAP.shared.txt for each that contains
 only the samples that are in both files
 """

 import sys
 from toolshed import reader
 from operator import itemgetter

 f1 = sys.argv[1]
 f2 = sys.argv[2]

 def tcga_to_info(tcga_id):
    """
    https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode
    """
    tokens = tcga_id.split("-")
    assert tokens[0] == "TCGA", (tokens)
    keys = ('tissue_source', 'participant', 'sample_type_vial',
            'portion_analyte', 'plate', 'center')
    d = dict(zip(keys, tokens[1:]))
    sv = d.pop('sample_type_vial')
    st = sv[:2]
    st = {'01': 'tumor', '11': 'normal', '10': 'blood'}.get(st, st)
    d['sample_type'] = st
    d['vial'] = sv[2:]
    pa = d.pop('portion_analyte')
    d['portion'] = pa[:2]
    d['analyte'] = pa[2:]
    d['id'] = "{participant}.{sample_type}.{tissue_source}".format(**d)
    return d['id']


 shared = [tcga_to_info(d[1]) for d in reader(f1, header=False) if d if d[0] !=
        "filename"]

 assert len(shared) == len(set(shared))

 shared2 = [tcga_to_info(d[1]) for d in reader(f2, header=False) if d if d[0] !=
        "filename"]
 assert len(shared2) == len(set(shared2))

 shared = set(shared2).intersection(shared)

 def print_new(f, shared):
    fnew = open(f[:-4] + ".shared.txt", "w")
    out = []
    for i, d in enumerate(reader(f, header="ordered")):
        if not d: continue
        if i == 0:
            print >>fnew, "\t".join(d.keys())

        if not tcga_to_info(d['barcode(s)']) in shared: continue
        out.append(d.values())
    # sort so we get the same order for both.
    print out[:10]
    for line in sorted(out, key=itemgetter(1)):
        print >>fnew, "\t".join(line)
    print fnew.name

 print_new(f1, shared)
 print_new(f2, shared)

diff --git a/tcga-as-matrix.py b/tcga-as-matrix.py
 """
 convert single-file methylation values from TCGA into a matrix
 with rows of probes and columns of samples
 can logit-transform, sort by position, and output row-names of chr:pos
 Also extracts out the TCGA-... id's into a file with a bit of clinical info
 """
 import sys
 import os.path as op
 from itertools import izip
 from math import log

 def get_sample_map(fname):
    """
    map file name to sample e.g.:
    jhu-usc.edu_BRCA.HumanMethylation450.1.lvl-3.TCGA-A7-A0CE-11A-21D-A10Q-05.txt	TCGA-A7-A0CE-11A-21D-A10Q-05
    this is needed for expression files, for example
    """
    return dict(t.rstrip('\r\n').split("\t") for t in open(fname) if t.strip())

 def main(prefix, logit, locs, do_sort, file_sample_map, files):
    file_map = get_sample_map(file_sample_map)

    # sort by the sample-id so we can get same order for expr, meth shoud it be
    # needed
    files = [f for s, f in sorted((tcga_to_info(file_map[op.basename(f)])['id'], f) for f in
        files)]

    file_iters = [(x.rstrip("\r\n").split("\t") for x in open(f)) for f in files]

    fh_meth = open('%s.matrix.txt' % prefix, 'w')
    fh_covs = open('%s.covs.txt' % prefix, 'w')


    def tryfloat(v):
        try:
            v = float(v)
        except ValueError:
            return ""
        return "%.3f" % (log(v / (1 - v)) if logit else v)

    out_lines = []
    for i, rows in enumerate(izip(*file_iters)):
        assert len(set(r[0] for r in rows)) == 1, rows
        if i == 0:
            #samples = [r[1] for r in rows]
            samples = [file_map[op.basename(f)] for f in files]
            samp_ids = print_sample_info(samples, fh_covs)

            print >>fh_meth, "\t".join(["probe"] + samp_ids)
        if i < 2: continue
        vals = [r[1] for r in rows]
        if all(v == "NA" for v in vals): continue

        vals = [tryfloat(v) for v in vals]
        if locs:
            chrom, pos = rows[0][-2], int(rows[0][-1])
            loc = "%s:%i" % (chrom, pos)
        else:
            loc = r[0]

        row = "\t".join([loc] + vals)
        if do_sort:
            out_lines.append((chrom, pos, row))
        else:
            print >>fh_meth, row
    if do_sort:
        print >>fh_meth, "\n".join(d[2] for d in sorted(out_lines))

 def print_sample_info(samples, fh):

    ids = []
    for j, sample in enumerate(samples):
        info = tcga_to_info(sample)
        sub_id = info.pop('id')
        if j == 0:
            print >>fh, "\t".join(['id', 'full_id'] + info.keys())
        print >>fh, "\t".join([sub_id, sample] + info.values())
        ids.append(sub_id)
    return ids

 def tcga_to_info(tcga_id):
    """
    https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode
    """
    tokens = tcga_id.split("-")
    assert tokens[0] == "TCGA", (tokens)
    keys = ('tissue_source', 'participant', 'sample_type_vial',
            'portion_analyte', 'plate', 'center')
    d = dict(zip(keys, tokens[1:]))
    sv = d.pop('sample_type_vial')
    st = sv[:2]
    st = {'01': 'tumor', '11': 'normal', '10': 'blood'}.get(st, st)
    d['sample_type'] = st
    d['vial'] = sv[2:]
    pa = d.pop('portion_analyte')
    d['portion'] = pa[:2]
    d['analyte'] = pa[2:]
    d['id'] = "{participant}.{sample_type}.{tissue_source}".format(**d)
    return d




 if __name__ == "__main__":
    import argparse
    ap = argparse.ArgumentParser(__doc__)
    ap.add_argument("--logit", action="store_true")
    ap.add_argument("--sorted", action="store_true", help="sort output by"
            " chrom, position; hold everything in memory")
    ap.add_argument("--locs", action="store_true", help="use e.g. 1:21434"
            "(chrom:pos) as the probe name rather than cg34544353")
    ap.add_argument('--prefix', help="output prefix")
    ap.add_argument('file_sample_map', help="path to FILE_SAMPLE_MAP.txt"
            "included in the download from TCGA")
    ap.add_argument('files', nargs='+', help="data-files")

    args = ap.parse_args()
    if not args.prefix:
        sys.exit(ap.print_help())
    main(args.prefix, args.logit, args.locs, args.sorted, args.file_sample_map, args.files)
	"""
	create a BED file with position and strand given the genes from a
	TCGA level 3 expression matrix
	"""
	from cruzdb import Genome
	from toolshed import reader
	import sys
	import os.path as op

	matrix = sys.argv[1] # has a column named "probe" that is a refGen name

	if not op.exists('hg19.db'):
	g = Genome('hg19').mirror(('refGene',) 'sqlite:///hg19.db')
	else:
	g = Genome('hg19.db')

	print "#chrom\tstart\tend\tprobe\tstrand\tsymbol"
	for d in reader('brca-expr.matrix.txt'):
	res = [r for r in g.refGene.filter_by(name2=d['probe']) if not "_" in
	r.chrom]
	if len(set(r.chrom for r in res)) != 1:
	continue # FAM138F

	if res == []:
	print >>sys.stderr, d['probe']
	1/0
	else:
	r = res[0]
	name = r.name2

	print "\t".join(map(str, (r.chrom, r.start, r.end, d['probe'], r.strand,
	name)))
	"""
	given 2 FILE_SAMPLE_MAP.txt, from 2 different platforms (e.g. methylation
	and expression), create a new FILE_SAMPLE_MAP.shared.txt for each that contains
	only the samples that are in both files
	"""

	import sys
	from toolshed import reader
	from operator import itemgetter

	f1 = sys.argv[1]
	f2 = sys.argv[2]

	def tcga_to_info(tcga_id):
	"""
	https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode
	"""
	tokens = tcga_id.split("-")
	assert tokens[0] == "TCGA", (tokens)
	keys = ('tissue_source', 'participant', 'sample_type_vial',
	'portion_analyte', 'plate', 'center')
	d = dict(zip(keys, tokens[1:]))
	sv = d.pop('sample_type_vial')
	st = sv[:2]
	st = {'01': 'tumor', '11': 'normal', '10': 'blood'}.get(st, st)
	d['sample_type'] = st
	d['vial'] = sv[2:]
	pa = d.pop('portion_analyte')
	d['portion'] = pa[:2]
	d['analyte'] = pa[2:]
	d['id'] = "{participant}.{sample_type}.{tissue_source}".format(**d)
	return d['id']


	shared = [tcga_to_info(d[1]) for d in reader(f1, header=False) if d if d[0] !=
	"filename"]

	assert len(shared) == len(set(shared))

	shared2 = [tcga_to_info(d[1]) for d in reader(f2, header=False) if d if d[0] !=
	"filename"]
	assert len(shared2) == len(set(shared2))

	shared = set(shared2).intersection(shared)

	def print_new(f, shared):
	fnew = open(f[:-4] + ".shared.txt", "w")
	out = []
	for i, d in enumerate(reader(f, header="ordered")):
	if not d: continue
	if i == 0:
	print >>fnew, "\t".join(d.keys())

	if not tcga_to_info(d['barcode(s)']) in shared: continue
	out.append(d.values())
	# sort so we get the same order for both.
	print out[:10]
	for line in sorted(out, key=itemgetter(1)):
	print >>fnew, "\t".join(line)
	print fnew.name

	print_new(f1, shared)
	print_new(f2, shared)
	"""
	convert single-file methylation values from TCGA into a matrix
	with rows of probes and columns of samples
	can logit-transform, sort by position, and output row-names of chr:pos
	Also extracts out the TCGA-... id's into a file with a bit of clinical info
	"""
	import sys
	import os.path as op
	from itertools import izip
	from math import log

	def get_sample_map(fname):
	"""
	map file name to sample e.g.:
	jhu-usc.edu_BRCA.HumanMethylation450.1.lvl-3.TCGA-A7-A0CE-11A-21D-A10Q-05.txt TCGA-A7-A0CE-11A-21D-A10Q-05
	this is needed for expression files, for example
	"""
	return dict(t.rstrip('\r\n').split("\t") for t in open(fname) if t.strip())

	def main(prefix, logit, locs, do_sort, file_sample_map, files):
	file_map = get_sample_map(file_sample_map)

	# sort by the sample-id so we can get same order for expr, meth shoud it be
	# needed
	files = [f for s, f in sorted((tcga_to_info(file_map[op.basename(f)])['id'], f) for f in
	files)]

	file_iters = [(x.rstrip("\r\n").split("\t") for x in open(f)) for f in files]

	fh_meth = open('%s.matrix.txt' % prefix, 'w')
	fh_covs = open('%s.covs.txt' % prefix, 'w')


	def tryfloat(v):
	try:
	v = float(v)
	except ValueError:
	return ""
	return "%.3f" % (log(v / (1 - v)) if logit else v)

	out_lines = []
	for i, rows in enumerate(izip(*file_iters)):
	assert len(set(r[0] for r in rows)) == 1, rows
	if i == 0:
	#samples = [r[1] for r in rows]
	samples = [file_map[op.basename(f)] for f in files]
	samp_ids = print_sample_info(samples, fh_covs)

	print >>fh_meth, "\t".join(["probe"] + samp_ids)
	if i < 2: continue
	vals = [r[1] for r in rows]
	if all(v == "NA" for v in vals): continue

	vals = [tryfloat(v) for v in vals]
	if locs:
	chrom, pos = rows[0][-2], int(rows[0][-1])
	loc = "%s:%i" % (chrom, pos)
	else:
	loc = r[0]

	row = "\t".join([loc] + vals)
	if do_sort:
	out_lines.append((chrom, pos, row))
	else:
	print >>fh_meth, row
	if do_sort:
	print >>fh_meth, "\n".join(d[2] for d in sorted(out_lines))

	def print_sample_info(samples, fh):

	ids = []
	for j, sample in enumerate(samples):
	info = tcga_to_info(sample)
	sub_id = info.pop('id')
	if j == 0:
	print >>fh, "\t".join(['id', 'full_id'] + info.keys())
	print >>fh, "\t".join([sub_id, sample] + info.values())
	ids.append(sub_id)
	return ids

	def tcga_to_info(tcga_id):
	"""
	https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode
	"""
	tokens = tcga_id.split("-")
	assert tokens[0] == "TCGA", (tokens)
	keys = ('tissue_source', 'participant', 'sample_type_vial',
	'portion_analyte', 'plate', 'center')
	d = dict(zip(keys, tokens[1:]))
	sv = d.pop('sample_type_vial')
	st = sv[:2]
	st = {'01': 'tumor', '11': 'normal', '10': 'blood'}.get(st, st)
	d['sample_type'] = st
	d['vial'] = sv[2:]
	pa = d.pop('portion_analyte')
	d['portion'] = pa[:2]
	d['analyte'] = pa[2:]
	d['id'] = "{participant}.{sample_type}.{tissue_source}".format(**d)
	return d




	if __name__ == "__main__":
	import argparse
	ap = argparse.ArgumentParser(__doc__)
	ap.add_argument("--logit", action="store_true")
	ap.add_argument("--sorted", action="store_true", help="sort output by"
	" chrom, position; hold everything in memory")
	ap.add_argument("--locs", action="store_true", help="use e.g. 1:21434"
	"(chrom:pos) as the probe name rather than cg34544353")
	ap.add_argument('--prefix', help="output prefix")
	ap.add_argument('file_sample_map', help="path to FILE_SAMPLE_MAP.txt"
	"included in the download from TCGA")
	ap.add_argument('files', nargs='+', help="data-files")

	args = ap.parse_args()
	if not args.prefix:
	sys.exit(ap.print_help())
	main(args.prefix, args.logit, args.locs, args.sorted, args.file_sample_map, args.files)