dwinter · June 1, 2012 10:37
diff --git a/Parse_multitree_paml.py b/Parse_multitree_paml.py
 #
 # Code to parse the interesting bits of a PAML results file that contains
 # statistics for several trees. Note, this approach contains some hacks that
 # are probably specific to multiple-tree CODEML files, Biopython has a module
 # for handling other PAML outputs, and it is probably a better starting point for
 # 'normal' (single tree) input files.
 #
 # TODO - should clean up handling of residues - use @property decorator and
 # ._functions to simplify writing/reading.
 #


 import sys
 import re
 from collections import defaultdict
 from scipy.stats import chisqprob

 class PAMLResult(object):
    """Results of a single PAML model fitted to a single tree.

    Attributes:
        lnL: the log-likelihood value handed to the constructor.
        residues: whatever was passed as ``residues`` (``None`` by default).
    """

    def __init__(self, lnL, residues =None):
        # Plain value holder: both arguments are stored unchanged.
        self.residues = residues
        self.lnL = lnL

    def __repr__(self):
        # Only the log-likelihood is shown; residues are left out.
        template = 'PAMLResult(lnL= {0})'
        return template.format(self.lnL)


 class PAMLTree(dict):
    """Hold PAML results for one tree across several models.

    Keys are model numbers; values are result objects exposing an ``lnL``
    attribute. Once ``_calculate_LRTs`` has run, ``LRT_m1m2`` and/or
    ``LRT_m7m8`` hold a ``(D, p-value)`` tuple for every comparison whose two
    models are both present; the attribute is left unset otherwise.
    """

    def __init__(self, *args):
        # The positional arguments are handed to dict as ONE tuple, so each
        # argument must itself be a (model, result) pair.
        dict.__init__(self, args)

    def _add_result(self, model, res):
        """Store ``res`` under ``model``, replacing any earlier entry."""
        self[model] = res

    def _calculate_LRTs(self):
        """Run the likelihood ratio tests for which both models are present.

        For each (null, alternative) pair -- models 1 vs 2 and 7 vs 8 -- the
        statistic is ``D = -2 * lnL_null + 2 * lnL_alt`` and the p-value is
        the chi-square survival function of ``D`` with 2 degrees of freedom.
        """
        # chisqprob has been removed from scipy.stats; chi2.sf is its
        # documented replacement and exists in old and new releases alike.
        from scipy.stats import chi2

        comparisons = ((1, 2, 'LRT_m1m2'), (7, 8, 'LRT_m7m8'))
        for null_model, alt_model, attr in comparisons:
            if null_model in self and alt_model in self:
                D = -2 * self[null_model].lnL + 2 * self[alt_model].lnL
                setattr(self, attr, (D, chi2.sf(D, 2)))


 class PAMLParser(object):
    """Parse an entire multiple-tree CODEML result file.

    Collects the log-likelihood of every model/tree combination found in the
    file, runs LRTs for the model comparisons and collates the residues
    listed by the Bayes Empirical Bayes (BEB) analysis.

    Usage

    result_file = PAMLParser("my_run.out")
    result_file.write("my_run", LRT=True, residues=True)

    will write files:
    'my_run_LRTs.csv', 'my_run_res_m2.csv' and 'my_run_res_m8.csv'
    (a residue file is skipped when some tree has no result for that model).
    """

    def __init__(self, fname):
        """Remember ``fname`` and parse it straight away."""
        self.fname = fname
        self.read()

    def _get_residues(self, model):
        """Collate the BEB table of ``model`` across all trees.

        Returns a dict mapping the first field of each table row to a list
        of the third field of that row, one entry per tree listing the row.
        Entries are appended in tree iteration order, so a tree that does not
        list a residue contributes nothing to that residue's list.

        Raises KeyError when a tree holds no result for ``model``.
        """
        d = defaultdict(list)
        for mod in self.combined.values():
            # residues is None when no BEB section was found for this tree
            for line in mod[model].residues or ():
                d[line[0]].append(line[2])
        return d

    def read(self):
        """Parse ``self.fname`` into ``self.combined``.

        The text is split on "Model " and then on "TREE "; every chunk that
        contains an lnL line is stored as a PAMLResult under
        ``self.combined[tree][model]``. Chunks without an lnL line are
        skipped. LRTs are calculated once everything has been read.
        """
        self.combined = defaultdict(PAMLTree)
        lnL_pattern = re.compile(r'np:\s+\d+\):\s+(-\d+\.\d+)')
        # Trailing digits of the tree header, so trees >= 10 keep their
        # whole number instead of only the last digit.
        tree_pattern = re.compile(r'(\d+)\s*$')
        beb_marker = "Bayes Empirical Bayes (BEB) analysis"

        with open(self.fname) as handle:
            mods = handle.read().split("Model ")[1:]

        for m in mods:
            m_name = int(m.split(':', 1)[0])
            trees = m.split('TREE ')[1:]
            for t in trees:
                header = t.split(':', 1)[0]
                name_match = tree_pattern.search(header)
                if name_match is None:
                    raise ValueError(
                        "cannot read tree number from {0!r}".format(header))
                t_name = int(name_match.group(1))

                ln_match = lnL_pattern.search(t)
                if ln_match is None:
                    # no lnL line for this tree/model: nothing to record
                    continue
                t_ln = float(ln_match.group(1))

                if beb_marker in t:
                    beb = t.split(beb_marker)[1]
                    table = beb.split("SE for w")[1].split('The grid')[0]
                    # index, default, posterior w _ SE  for residues
                    rows = [tuple(l.split()) for l in table.split("\n")]
                    # drop blank / whitespace-only lines (e.g. stray "\r")
                    residues = [row for row in rows if row]
                else:
                    residues = None
                res = PAMLResult(lnL=t_ln, residues=residues)
                self.combined[t_name]._add_result(m_name, res)

        # all done, if there are any LRT-tests to run we can do them now
        for p in self.combined.values():
            p._calculate_LRTs()

    def _write_residues(self, file_stem, model, label):
        """Write the collated BEB values of ``model`` to a CSV file.

        The file is named ``file_stem + "_res_m<model>.csv"``; ``label`` is
        the model name used in the progress message. Nothing is written when
        some tree has no result for ``model``.
        """
        try:
            collated = self._get_residues(model)
        except KeyError:
            return  # not finished / no results for this model

        # One "t<i>" column per value in the longest row.
        n_cols = max([len(v) for v in collated.values()] or [0])
        header = "res, " + ",".join(
            ["t" + str(i) for i in range(1, n_cols + 1)])
        with open("{0}_res_m{1}.csv".format(file_stem, model), "w") as out:
            out.write(header + "\n")
            for res, vals in collated.items():
                out.write("{0},{1}\n".format(
                    res, ",".join(f.strip("*") for f in vals)))
        print("wrote data for {0} residues using model {1}".format(
            len(collated), label))

    def write(self, file_stem, LRT = True, residues= True):
        """Write CSV summaries whose names start with ``file_stem``.

        LRT: when true, write ``file_stem + "_LRTs.csv"`` with one row per
            tree and available comparison.
        residues: when true, write the collated BEB values for model 8 and
            then model 2 (see ``_write_residues``).
        """
        if LRT:
            t = 0
            comparisons = (("LRT_m1m2", "m1am2a"), ("LRT_m7m8", "m7m8"))
            with open(file_stem + "_LRTs.csv", "w") as out:
                out.write("tree,comp,D, p-val\n")
                for tree, results in self.combined.items():
                    for attr, label in comparisons:
                        lrt = getattr(results, attr, None)
                        if lrt is None:
                            continue  # no results for that comparison
                        D, pval = lrt
                        out.write("{0}, {1}, {2}, {3}\n".format(
                            tree, label, round(D, 3), pval))
                        t += 1
            print("wrote data for {0} LRTs".format(t))

        if residues:
            self._write_residues(file_stem, 8, "8")
            self._write_residues(file_stem, 2, "2a")


 def main():
    """Summarise the result file given as the first command-line argument.

    The output files use the input file name plus '_result' as their stem.
    Exits with a usage message when no file name is given.
    """
    if len(sys.argv) < 2:
        # Without this guard a missing argument surfaces as a bare IndexError.
        sys.exit("usage: {0} <codeml_result_file>".format(sys.argv[0]))
    result = PAMLParser(sys.argv[1])
    result.write(sys.argv[1] + '_result')

 if __name__ == "__main__":
    main()
	#
	# Code to parse the interesting bits of a PAML results file that contains
	# statistics for several trees. Note, this approach contains some hacks that
	# are probably specific to multiple-tree CODEML files, Biopython has a module
	# for handling other PAML outputs, and it is probably a better starting point for
	# 'normal' (single tree) input files.
	#
	# TODO - should clean up handling of residues - use @property decorator and
	# ._functions to simplify writing/reading.
	#


	import sys
	import re
	from collections import defaultdict
	from scipy.stats import chisqprob

	class PAMLResult(object):
	    """Represent the results from one PAML model for one tree.

	    Attributes:
	        lnL: the log-likelihood value handed to the constructor.
	        residues: whatever was passed as ``residues`` (``None`` by default).
	    """

	    def __init__(self, lnL, residues =None):
	        # Plain value holder: both arguments are stored unchanged.
	        self.lnL = lnL
	        self.residues = residues

	    def __repr__(self):
	        # Only the log-likelihood is shown; residues are left out.
	        return 'PAMLResult(lnL= {0})'.format(self.lnL)


	class PAMLTree(dict):
	    """Hold PAML results for one tree across several models.

	    Keys are model numbers; values are result objects exposing an ``lnL``
	    attribute. Once ``_calculate_LRTs`` has run, ``LRT_m1m2`` and/or
	    ``LRT_m7m8`` hold a ``(D, p-value)`` tuple for every comparison whose
	    two models are both present; the attribute is left unset otherwise.
	    """

	    def __init__(self, *args):
	        # The positional arguments are handed to dict as ONE tuple, so each
	        # argument must itself be a (model, result) pair.
	        dict.__init__(self, args)

	    def _add_result(self, model, res):
	        """Store ``res`` under ``model``, replacing any earlier entry."""
	        self[model] = res

	    def _calculate_LRTs(self):
	        """Run the likelihood ratio tests for which both models are present.

	        For each (null, alternative) pair -- models 1 vs 2 and 7 vs 8 --
	        the statistic is ``D = -2 * lnL_null + 2 * lnL_alt`` and the
	        p-value is the chi-square survival function of ``D`` with 2
	        degrees of freedom.
	        """
	        # chisqprob has been removed from scipy.stats; chi2.sf is its
	        # documented replacement and exists in old and new releases alike.
	        from scipy.stats import chi2

	        comparisons = ((1, 2, 'LRT_m1m2'), (7, 8, 'LRT_m7m8'))
	        for null_model, alt_model, attr in comparisons:
	            if null_model in self and alt_model in self:
	                D = -2 * self[null_model].lnL + 2 * self[alt_model].lnL
	                setattr(self, attr, (D, chi2.sf(D, 2)))


	class PAMLParser(object):
	    """Parse an entire multiple-tree CODEML result file.

	    Collects the log-likelihood of every model/tree combination found in
	    the file, runs LRTs for the model comparisons and collates the
	    residues listed by the Bayes Empirical Bayes (BEB) analysis.

	    Usage

	    result_file = PAMLParser("my_run.out")
	    result_file.write("my_run", LRT=True, residues=True)

	    will write files:
	    'my_run_LRTs.csv', 'my_run_res_m2.csv' and 'my_run_res_m8.csv'
	    (a residue file is skipped when some tree has no result for that model).
	    """

	    def __init__(self, fname):
	        """Remember ``fname`` and parse it straight away."""
	        self.fname = fname
	        self.read()

	    def _get_residues(self, model):
	        """Collate the BEB table of ``model`` across all trees.

	        Returns a dict mapping the first field of each table row to a list
	        of the third field of that row, one entry per tree listing the
	        row. Entries are appended in tree iteration order, so a tree that
	        does not list a residue contributes nothing to that residue's list.

	        Raises KeyError when a tree holds no result for ``model``.
	        """
	        d = defaultdict(list)
	        for mod in self.combined.values():
	            # residues is None when no BEB section was found for this tree
	            for line in mod[model].residues or ():
	                d[line[0]].append(line[2])
	        return d

	    def read(self):
	        """Parse ``self.fname`` into ``self.combined``.

	        The text is split on "Model " and then on "TREE "; every chunk
	        that contains an lnL line is stored as a PAMLResult under
	        ``self.combined[tree][model]``. Chunks without an lnL line are
	        skipped. LRTs are calculated once everything has been read.
	        """
	        self.combined = defaultdict(PAMLTree)
	        lnL_pattern = re.compile(r'np:\s+\d+\):\s+(-\d+\.\d+)')
	        # Trailing digits of the tree header, so trees >= 10 keep their
	        # whole number instead of only the last digit.
	        tree_pattern = re.compile(r'(\d+)\s*$')
	        beb_marker = "Bayes Empirical Bayes (BEB) analysis"

	        with open(self.fname) as handle:
	            mods = handle.read().split("Model ")[1:]

	        for m in mods:
	            m_name = int(m.split(':', 1)[0])
	            trees = m.split('TREE ')[1:]
	            for t in trees:
	                header = t.split(':', 1)[0]
	                name_match = tree_pattern.search(header)
	                if name_match is None:
	                    raise ValueError(
	                        "cannot read tree number from {0!r}".format(header))
	                t_name = int(name_match.group(1))

	                ln_match = lnL_pattern.search(t)
	                if ln_match is None:
	                    # no lnL line for this tree/model: nothing to record
	                    continue
	                t_ln = float(ln_match.group(1))

	                if beb_marker in t:
	                    beb = t.split(beb_marker)[1]
	                    table = beb.split("SE for w")[1].split('The grid')[0]
	                    # index, default, posterior w _ SE for residues
	                    rows = [tuple(l.split()) for l in table.split("\n")]
	                    # drop blank / whitespace-only lines (e.g. stray "\r")
	                    residues = [row for row in rows if row]
	                else:
	                    residues = None
	                res = PAMLResult(lnL=t_ln, residues=residues)
	                self.combined[t_name]._add_result(m_name, res)

	        # all done, if there are any LRT-tests to run we can do them now
	        for p in self.combined.values():
	            p._calculate_LRTs()

	    def _write_residues(self, file_stem, model, label):
	        """Write the collated BEB values of ``model`` to a CSV file.

	        The file is named ``file_stem + "_res_m<model>.csv"``; ``label``
	        is the model name used in the progress message. Nothing is written
	        when some tree has no result for ``model``.
	        """
	        try:
	            collated = self._get_residues(model)
	        except KeyError:
	            return  # not finished / no results for this model

	        # One "t<i>" column per value in the longest row.
	        n_cols = max([len(v) for v in collated.values()] or [0])
	        header = "res, " + ",".join(
	            ["t" + str(i) for i in range(1, n_cols + 1)])
	        with open("{0}_res_m{1}.csv".format(file_stem, model), "w") as out:
	            out.write(header + "\n")
	            for res, vals in collated.items():
	                out.write("{0},{1}\n".format(
	                    res, ",".join(f.strip("*") for f in vals)))
	        print("wrote data for {0} residues using model {1}".format(
	            len(collated), label))

	    def write(self, file_stem, LRT = True, residues= True):
	        """Write CSV summaries whose names start with ``file_stem``.

	        LRT: when true, write ``file_stem + "_LRTs.csv"`` with one row per
	            tree and available comparison.
	        residues: when true, write the collated BEB values for model 8 and
	            then model 2 (see ``_write_residues``).
	        """
	        if LRT:
	            t = 0
	            comparisons = (("LRT_m1m2", "m1am2a"), ("LRT_m7m8", "m7m8"))
	            with open(file_stem + "_LRTs.csv", "w") as out:
	                out.write("tree,comp,D, p-val\n")
	                for tree, results in self.combined.items():
	                    for attr, label in comparisons:
	                        lrt = getattr(results, attr, None)
	                        if lrt is None:
	                            continue  # no results for that comparison
	                        D, pval = lrt
	                        out.write("{0}, {1}, {2}, {3}\n".format(
	                            tree, label, round(D, 3), pval))
	                        t += 1
	            print("wrote data for {0} LRTs".format(t))

	        if residues:
	            self._write_residues(file_stem, 8, "8")
	            self._write_residues(file_stem, 2, "2a")


	def main():
	    """Summarise the result file given as the first command-line argument.

	    The output files use the input file name plus '_result' as their stem.
	    Exits with a usage message when no file name is given.
	    """
	    if len(sys.argv) < 2:
	        # Without this guard a missing argument surfaces as a bare IndexError.
	        sys.exit("usage: {0} <codeml_result_file>".format(sys.argv[0]))
	    result = PAMLParser(sys.argv[1])
	    result.write(sys.argv[1] + '_result')

	if __name__ == "__main__":
	    main()