Skip to content

Instantly share code, notes, and snippets.

@dwinter
Created June 1, 2012 10:37
Show Gist options
  • Save dwinter/2851069 to your computer and use it in GitHub Desktop.
Save dwinter/2851069 to your computer and use it in GitHub Desktop.
Parse results for a multi-tree PAML run
#
# Code to parse the interesting bits of a PAML results file that contains
# statistics for several trees. Note, this approach contains some hacks that
# are probably specific to multiple-tree CODEML files, Biopython has a module
# for handling other PAML outputs, and it probably a better starting point for
# 'normal' (single tree) input files.
#
# TODO - should clean up handlng of resifues - use @property decorator and
# ._functions to simplify writing/reading.
#
import sys
import re
from collections import defaultdict
from scipy.stats import chisqprob
class PAMLResult(object):
"""Represent the results from one PAMl model for one tree """
def __init__(self, lnL, residues =None):
self.lnL = lnL
self.residues = residues
def __repr__(self):
return 'PAMLResult(lnL= {0})'.format(self.lnL)
class PAMLTree(dict):
"""Hold PAML restuls for one tree across several models"""
def __init__(self, *args):
dict.__init__(self, args)
def _add_result(self, model, res):
self[model] = res
def _calculate_LRTs(self):
"""Run likelihood ratio test if there are enough results """
if all( [m in self.keys() for m in [1,2]] ):
D = -2 * self[1].lnL + 2 * self[2].lnL
pval = chisqprob(D,2)
self.LRT_m1m2 = (D, pval)
if all( [m in self.keys() for m in [7,8]] ):
D = -2 * self[7].lnL + 2 * self[8].lnL
pval = chisqprob(D,2)
self.LRT_m7m8 = (D, pval)
class PAMLParser(object):
"""Parse an entire CODEML result file
Collects data from all models and trees analysed and runs LRTs for model
comparions as well as collating residues found to be under selection using
Bayes Emperical Bayes.
Usage
result_file = PAMLParser("my_run.out")
result_file.write("my_run", LRT=True, residues=False)
will write files:
'my_run_LRTs.csv', 'my_run_res_m2.csv' and 'my_run_resm8.csv'
"""
def __init__(self, fname):
self.fname = fname
self.read()
def _get_residues(self, model):
""" """
d = defaultdict(list)
for tree, mod in self.combined.items():
for line in mod[model].residues:
d[line[0]].append(line[2])
return d
def read(self):
self.combined = defaultdict(PAMLTree)
lnL_pattern = re.compile('np:\s+\d+\):\s+(-\d+\.\d+)')
mods = open(self.fname).read().split("Model ")[1:]
for m in mods:
m_name = int(m.split(':', 1)[0])
trees = m.split('TREE ')[1:]
for t in trees:
t_name = int(t.split(':')[0][-1])
ln_match = lnL_pattern.search(t)
t_ln = float(ln_match.group(1))
if "Bayes Empirical Bayes" in t:
beb = t.split("Bayes Empirical Bayes (BEB) analysis")[1]
table = beb.split("SE for w")[1].split('The grid')[0]
#index, default, posterior w _ SE for residues
residues = [tuple(l.split()) for l in table.split("\n") if l]
else:
residues = None
res = PAMLResult(lnL=t_ln, residues=residues)
self.combined[t_name]._add_result(m_name, res)
#all done, if there are any LRT-tests to run we can do them now
[p._calculate_LRTs() for p in self.combined.values()]
def write(self, file_stem, LRT = True, residues= True):
""" """
if LRT:
out = open(file_stem + "_LRTs.csv", "w")
out.write("tree,comp,D, p-val\n")
t = 0
for tree, results in self.combined.items():
try:
D, pval = results.LRT_m1m2
out.write("{0}, m1am2a, {1}, {2}\n".format(
tree,round(D,3), pval))
t += 1
except(AttributeError):
pass #no results for that comparisom
try:
D, pval = results.LRT_m7m8
out.write("{0}, m7m8, {1}, {2}\n".format(
tree,round(D,3), pval))
t += 1
except(AttributeError):
pass #no results for that comparisom
out.close()
print "wrote data for {0} LRTs".format(t)
if residues:
try:
m8 = self._get_residues(8)
out = open(file_stem +"_res_m8.csv","w")
header = "res, " + ",".join(
["t" + str(i) for i in range(1, len(m8.values()[1])+1)]
)
out.write(header + "\n")
counter = 0
for res, vals in m8.items():
line = "{0},{1}\n".format(
res, ",".join(f.strip("*") for f in vals))
out.write(line)
counter += 1
print "wrote data for {0} residues using model 8".format(counter)
except KeyError:
pass
try:
m2 = self._get_residues(2)
out = open(file_stem +"_res_m2.csv","w")
header = "res, " + ",".join(
["t" + str(i) for i in range(1, len(m2.values()[1])+1)]
)
out.write(header + "\n")
counter = 0
for res, vals in m2.items():
line = "{0},{1}\n".format(
res, ",".join(f.strip("*") for f in vals))
out.write(line)
counter += 1
print "wrote data for {0} residues using model 2a".format(counter)
except KeyError:
pass #not finised/not results for this model
def main():
"""Get the information form one file, given as a commandline argument """
result = PAMLParser(sys.argv[1])
result.write(sys.argv[1] + '_result')
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment