hussius · November 17, 2016 09:39 · hussius · Nov 17, 2016
diff --git a/sum_by_gene.py b/sum_by_gene.py
 import sys
 import gzip 

 if len(sys.argv)<3:
    sys.exit("python sum_per_gene.py <cDNA FASTA file> <TPM table>")

 ensg = {}

 mapf = gzip.open(sys.argv[1])
 ctr = 0
 for line in mapf:
    if not line.startswith('>'): continue
    ctr += 1
    if ctr % 1000 == 0: sys.stderr.write(ctr)
    c = line.split(' ')
    tx = c[0][1:]
    gene = c[3].split(':')[1].split('.')[0]
    ensg[tx] = gene

 gene_tpm = {}

 tpm = open(sys.argv[2])
 print(tpm.readline().strip()) # Header

 for line in tpm:
    c = line.split()
    tx = c[0]
    vals = []
    for val in c[1:len(c)]:
        vals.append(float(val))
    gene = ensg[tx]
    if gene in gene_tpm:
        prev = gene_tpm[gene] 
        curr = vals
        new = [prev[i]+curr[i] for i in range(0,len(prev))]
        gene_tpm[gene] = new
    else:
        gene_tpm[gene] = vals

 for g in gene_tpm:
    string_rep = [x for x in map(str, gene_tpm[g])]
    print(g + '\t' + '\t'.join(string_rep))
	import sys
	import gzip

	if len(sys.argv)<3:
	sys.exit("python sum_per_gene.py <cDNA FASTA file> <TPM table>")

	ensg = {}

	mapf = gzip.open(sys.argv[1])
	ctr = 0
	for line in mapf:
	if not line.startswith('>'): continue
	ctr += 1
	if ctr % 1000 == 0: sys.stderr.write(ctr)
	c = line.split(' ')
	tx = c[0][1:]
	gene = c[3].split(':')[1].split('.')[0]
	ensg[tx] = gene

	gene_tpm = {}

	tpm = open(sys.argv[2])
	print(tpm.readline().strip()) # Header

	for line in tpm:
	c = line.split()
	tx = c[0]
	vals = []
	for val in c[1:len(c)]:
	vals.append(float(val))
	gene = ensg[tx]
	if gene in gene_tpm:
	prev = gene_tpm[gene]
	curr = vals
	new = [prev[i]+curr[i] for i in range(0,len(prev))]
	gene_tpm[gene] = new
	else:
	gene_tpm[gene] = vals

	for g in gene_tpm:
	string_rep = [x for x in map(str, gene_tpm[g])]
	print(g + '\t' + '\t'.join(string_rep))