Script written by Diogo Ribeiro to identify gene family expansions in an in-house Compara database
#!/usr/bin/env python3 | |
#25-Feb-2015 dr7 | |
#Analysis suite for exploring Compara families. The purpose is to gather information about Compara trees so that we can mine this large set of information and select the most interesting families to study.
import sys | |
import os | |
import re | |
import gzip | |
import random | |
import pickle | |
from numpy import mean, median, std #numpy.std gets the population standard deviation, not the sample standard deviation | |
from glob import glob | |
from pprint import pprint | |
from time import perf_counter as clock #time.clock was removed in Python 3.8; perf_counter is the closest drop-in replacement
from warnings import warn | |
#dependency | |
import findLastCommonAncestor #dr7 deployed and tested code | |
from Bio import Phylo | |
time0 = clock() | |
class Run(object): | |
def __init__(self, comparaFamilies,familiesPerTaxonomyLevel,geneDuplicationsFolder,geneLossesFolder,speciesNameTag,geneIDMappingFolder, \ | |
interproscanGFFFolder,branchLengths,removeOutgroupsFile,speciesTreeFile,verbose,outputFile,dumpFamilies,flagFamilyList,speciesClade, \ | |
geneFilterList,dumpsFolder,nodeInfo,proteinLengthsFolder): | |
"""Initialization of all class variables. Explains data structures.""" | |
#inputs / arguments | |
self.comparaFamiliesFile = open(comparaFamilies,"r") | |
self.familiesPerTaxonomyLevelFile = familiesPerTaxonomyLevel | |
self.speciesTreeFile = speciesTreeFile | |
self.geneDuplicationsFolder = geneDuplicationsFolder | |
self.geneLossesFolder = geneLossesFolder | |
self.speciesNameTagFile = open(speciesNameTag,"r") | |
self.geneIDMappingFolder = geneIDMappingFolder | |
self.interproscanGFFFolder = interproscanGFFFolder | |
self.branchLengthsFile = open(branchLengths,"r") | |
self.cladeFile = open(speciesClade,"r") | |
self.nodeInfoFile = open(nodeInfo,"r") | |
#optional inputs / arguments | |
self.verbose = verbose | |
self.outputFile = open(outputFile,"w") | |
self.removeOutgroupsFile = removeOutgroupsFile | |
self.dumpFamiliesFile = dumpFamilies | |
self.flagFamilyListFile = flagFamilyList | |
self.geneFilterListFile = geneFilterList | |
self.dumpsFolder = dumpsFolder | |
self.proteinLengthsFolder = proteinLengthsFolder | |
#hard-coded 'arguments' | |
self.jobID = str(random.randint(1,100000)) | |
self.targetSources = ["Pfam","TMHMM","SignalP_EUK"] #the types of GFF source entries that we are interested in storing in memory. #this is kept here as it may be modified, but not by user. | |
self.freeLivingSpecies = ["caenorhabditis_elegans","panagrellus_redivivus","rhabditophanes_kr3021","pristionchus_pacificus","schmidtea_mediterranea"] # 50HG Species that are completely free-living, so that I can distinguish parasitic-specific gene families. #This is just for helminth species, other non-helminth outgroups would be free-living too. | |
#data structures #main connector is familyID or species | |
self.familiesDict = {} #key -> familyID, val -> dict: key -> species, val -> list of genes | |
self.familiesTaxon = {} #key -> familyID, val -> taxonomy level | |
self.numTaxonBelow = {} #key -> taxon ID, val -> number species below | |
self.familiesGeneDuplication = {} #key -> familyID, val -> sum of duplications across all species-tree nodes
self.familiesGeneDuplicationMaxNode = {} #key -> familyID, val -> node ID with most duplication events | |
self.familiesGeneLosses = {} #key -> familyID, val -> sum of losses across all species-tree nodes
self.tagSpeciesName = {} #key -> species_tag, val -> species_name #this is to connect species to interproscan files | |
self.geneIDMapping = {} #key -> species, val -> dict: key -> geneID(interpro), val -> list of other geneIDs #this is to connect genes in interproscan to genes in families.txt | |
self.geneIDMappingAllEntries = {} #key -> mRNA ID, val -> list of other IDs | |
self.interproscanData = {} #key -> species, value -> dict; key -> gene, value -> dict; key-> source, val-> all lines attributed to that gene/source | |
self.branchLengths = {} #key -> familyID, val -> average branch length in family | |
self.cladePerSpecies = {} #key -> species, val -> clade | |
self.speciesPerClade = {} #key -> clade, value -> list of species in the clade (based on the species tree)
self.nodeInfo = {} #key -> node ID, list of clades potentially present under that node. e.g. 'bilateria40001012': {'I', 'IV', 'Flatworm', 'Flukes', 'Tapeworm', 'V', 'III'} | |
self.nodeInfoSpp = {} #key -> node ID, list of species present under that node. | |
self.proteinLengths = {} #key -> species, val -> dict ; key -> gene, val -> protein length in aa | |
self.familiesDictLengths = {} #key -> familyID, val -> dict: key -> species, val -> sum of protein length of proteins in that family and species. | |
#optional data structures | |
self.speciesTree = {} #key -> parent, value -> child. Done from a newick tree with external script "findLastCommonAncestor" | |
self.removeOutgroups = [] #list of node/leaf ids + species/node-name to remove from analysis | |
self.speciesToRemove = [] #list of species to remove from analysis #to apply when processing families.txt | |
self.familiesToFlag = [] #list of family IDs of families to have 'exclude' flag | |
self.geneFilterList = [] #list of gene IDs to filter out | |
self.listOfSpecies = [] #list of species we process here #based on the species tree! | |
##Notes: | |
#The species name should be lower case and with underscore e.g. panagrellus_redivivus | |
#The family IDs should be an int | |
#geneFilterListFile: added gene filtering into the readFamily step. Most measures will be (re)calculated in this script itself (e.g. gene tree root is recalculated). Note though that the following measures won't be correct/updated when gene filtering is applied: Total_duplications, Total_losses, Max_duplications_node, Median_branch_length. These are external measures, calculated only before the gene filtering.
def readFamilies(self): | |
"""Parse compara families file. This file can contain all species and genes (i.e. can be Compara families raw file). | |
Also does the species filtering (i.e. only counts genes in species that are not listed to remove). | |
Also does gene filtering, if given a list of genes to discard. | |
Only retains families that have at least 2 genes after filtering. | |
Also tracks protein lengths in addition to gene counts: instead of only counting genes, it also sums the protein lengths of a species' genes in the family (e.g. a species, instead of having 10 genes in that family, has 3000 aa).
E.g. input file line: family 3 : SSTP_0000649300.1 (strongyloides_stercoralis) DME_0000314201-mRNA-1 (dracunculus_medinensis) TELCIR_18205 (teladorsagia_circumcincta) HPBE_0001583601-mRNA-1 (heligmosomoides_bakeri) """ | |
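#Illustrative sketch (comments only) of how the example line above is parsed by the loop below,
#assuming the "family <ID> : gene1 (species1) gene2 (species2) ..." layout shown in the docstring:
# line.split(":",1) -> ["family 3 ", " SSTP_0000649300.1 (strongyloides_stercoralis) ..."]
# spl[0].split(" ")[1] -> "3" (the familyID)
# spl[1].split(" ")[1:] -> alternating gene / "(species)" tokens, consumed two at a time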
if self.verbose: print ("Reading compara families files..") | |
comparaFamiliesFile = self.comparaFamiliesFile | |
speciesToRemove = self.speciesToRemove | |
geneFilterList = self.geneFilterList | |
proteinLengths = self.proteinLengths | |
familiesDict = {} | |
familiesDictLengths = {} | |
for line in comparaFamiliesFile: #for each family | |
if "----------" not in line: | |
line = line.strip() | |
spl = line.split(":",1) #doing only max of one split, this is because some IDs also contain ":" | |
assert (len(spl) == 2) | |
familyID = int(spl[0].split(" ")[1]) | |
speciesDict = {} #key -> sp, val -> list of genes | |
speciesDictLengths = {} #same as previous but stores cumulative lengths of genes of a species as an integer | |
genes = spl[1].split(" ")[1:] | |
for i in range(0,len(genes),2): | |
gene,sp = genes[i],genes[i+1] | |
if gene not in geneFilterList: #otherwise it is ignored, as if not existing. After this there should be a condition excluding potentially empty families. | |
sp = sp.replace("(","").replace(")","") | |
if sp not in speciesToRemove: #validated that this works well by looking at a couple of families | |
if sp not in speciesDict: | |
speciesDict[sp] = [] | |
speciesDictLengths[sp] = 0 | |
speciesDict[sp].append(gene) | |
#for protein lengths instead of gene numbers | |
proteinLength = int(proteinLengths[sp][gene] ) | |
speciesDictLengths[sp]+= proteinLength | |
lens = [len(speciesDict[sp]) for sp in speciesDict] #numbers of genes | |
sumLens = sum(lens) | |
if len(speciesDict) > 0 and sumLens > 1: #and sum(list(speciesDict.values()) ) > 1: #there can be empty families, if all species of that tree are to be removed #also there could be single gene 'families' after filtering. | |
familiesDict[familyID] = speciesDict | |
familiesDictLengths[familyID] = speciesDictLengths | |
if self.verbose: print ("Total number compara families (from families.txt) to be processed, after filtering:\t%s" % len(familiesDict)) #this number does not have to match the total number of families, in case species are being removed | |
assert (len(familiesDict) == len(familiesDictLengths)) | |
self.familiesDict = familiesDict | |
self.familiesDictLengths = familiesDictLengths | |
def readFamiliesPerTaxonomyLevel(self): | |
"""Reads families per taxonomy level. Note that, if the families.txt is filterd in this script, the input for this function may be a temporary file (corrected for new families) created in this script itself. | |
E.g. Hymenolepis microstoma 40001043 : 841230,912843,1018527,1058154,1059819,1081886,1082962,1090840,1117204,1117804,1118812""" | |
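#Worked example (a sketch based on the docstring line above) of the taxon key built below:
# "Hymenolepis microstoma 40001043 : 841230,..." -> taxon key "hymenolepis_microstoma40001043",
# the name+ID format later used as the family's branchName (e.g. looked up in numTaxonBelow).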
if self.verbose: print ("Reading %s file.." % self.familiesPerTaxonomyLevelFile) | |
familiesPerTaxonomyLevelFile = open(self.familiesPerTaxonomyLevelFile,"r") | |
familiesTaxon = {} | |
for line in familiesPerTaxonomyLevelFile: | |
line = line.strip() | |
taxon,families = line.split(":") | |
taxon = taxon.lower().strip().split(" ") #may need lower() | |
taxonNum = taxon[-1] | |
taxon = "_".join(taxon[:-1])+taxonNum #e.g. Nippostrongylus_brasiliensis40001108 | |
families = families.strip().split(",") | |
for family in families: | |
family = int(family) | |
if family not in familiesTaxon: | |
familiesTaxon[family] = taxon | |
else: | |
print ("Two roots for family?",family) | |
if self.verbose: print ("Total families in families_per_taxonomy_level:\t%s" % len(familiesTaxon)) | |
self.familiesTaxon = familiesTaxon | |
def _readTaxonFiles(self,inFolder): | |
"""E.g filename: echinostomatoidea40001020.txt. Inside file: 66838 2 (familyID\tNum_Events). | |
What I want is to combine all events for each family.""" | |
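#Sketch of the aggregation below, using the node files quoted in the validation comments further down:
#if family 126577 appears as "126577 4" in ascaridoidea40001165.txt and "126577 12" in bilateria40001006.txt,
#then outDict[126577] accumulates 4+12 and outDictNode[126577] keeps {"ascaridoidea40001165": 4,
#"bilateria40001006": 12}, so the node with the most events can be picked later.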
removeOutgroups = self.removeOutgroups | |
outDict = {} #key -> family ID, val -> tot events | |
outDictNode = {} #key -> family ID, val -> dict; key -> nodeID, val -> tot events #to have node with most events | |
filesToProcess = glob(inFolder+"/*.txt") | |
for fi in filesToProcess: | |
fileName = fi.split("/")[-1].replace(".txt","") | |
if fileName in removeOutgroups: #exclude data from nodes/species that we want to exclude | |
continue | |
# #checking if taxon IDs match between files | |
# print (self.numTaxonBelow[fileName]) | |
f = open(fi,"r") | |
for line in f: | |
if "#" in line: | |
continue | |
line = line.strip() | |
spl = line.split("\t") | |
familyID = int(spl[0]) | |
events = int(spl[1]) | |
if familyID not in outDict: | |
outDict[familyID] = 0 | |
outDictNode[familyID] = {} | |
if fileName not in outDictNode[familyID]: | |
outDictNode[familyID][fileName] = 0 | |
outDict[familyID]+=events | |
outDictNode[familyID][fileName]+=events | |
f.close() | |
return outDict,outDictNode | |
def readGeneDuplicationsFolder(self): | |
"""Process all files in Gene Duplications folder""" | |
if self.verbose: print ("Reading gene duplication files..") | |
geneDuplicationsFolder = self.geneDuplicationsFolder | |
run = self._readTaxonFiles(geneDuplicationsFolder) | |
familiesGeneDuplication = run[0] | |
nodeDict = run[1] | |
#getting most frequent node | |
familiesGeneDuplicationMaxNode = {} | |
for familyID in nodeDict: | |
maxVal = 0 | |
maxNode = "" | |
if familyID not in familiesGeneDuplicationMaxNode: | |
familiesGeneDuplicationMaxNode[familyID] = "" | |
for node in nodeDict[familyID]: | |
val = nodeDict[familyID][node] | |
if val > maxVal: | |
maxVal = val | |
maxNode = node | |
familiesGeneDuplicationMaxNode[familyID] = str(maxVal)+"X:"+maxNode | |
##validation | |
# dr7@farm3-head2:/nfs/helminths02/analysis/50HGP/00ANALYSES/final_gene_duplications/cutoffm1$ g 1178810 * | |
# crassostrea_gigas40001175.txt:1178810 1 | |
# schmidtea_mediterranea40001061.txt:1178810 2 | |
# Family 126577: '37X:schmidtea_mediterranea40001061' | |
# dr7@farm3-head2:/nfs/helminths02/analysis/50HGP/00ANALYSES/final_gene_duplications/cutoffm1$ grep 126577 * | |
# ... | |
# ascaridoidea40001165.txt:126577 4 | |
# bilateria40001006.txt:126577 12 | |
# ... | |
# schmidtea_mediterranea40001061.txt:126577 37 | |
# ... | |
if self.verbose: print ("Total families with gene duplication info:\t%s" % len(familiesGeneDuplication)) | |
self.familiesGeneDuplication = familiesGeneDuplication | |
self.familiesGeneDuplicationMaxNode = familiesGeneDuplicationMaxNode | |
def readGeneLossesFolder(self): | |
"""Process all files in Gene Losses folder""" | |
if self.verbose: print ("Reading gene loss files..") | |
geneLossesFolder = self.geneLossesFolder | |
familiesGeneLosses = self._readTaxonFiles(geneLossesFolder)[0] | |
#Validation: 1135828: 6 | |
# dr7@pcs5b:/nfs/helminths02/analysis/50HGP/00ANALYSES/final_gene_losses$ g 1135828 * | |
# chromadorea40001076.txt:1135828 1 | |
# platyhelminthes40001014.txt:1135828 1 | |
# romanomermis_culicivorax40001064.txt:1135828 1 | |
# soboliphyme_baturini40001066.txt:1135828 1 | |
# trichuris_muris40001075.txt:1135828 1 | |
# trichuris_trichiura40001074.txt:1135828 1 | |
if self.verbose: print ("Total families with gene losses info:\t%s" % len(familiesGeneLosses)) | |
self.familiesGeneLosses = familiesGeneLosses | |
def readSpeciesNameTag(self): | |
"""Reads tag-species_name correspondence. E.g. PREFIX Genus species VERSION | |
ACAC Angiostrongylus cantonensis 1.5.4""" | |
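#Example of the mapping built below (a sketch based on the docstring line above, tab-separated columns):
# "ACAC<TAB>Angiostrongylus<TAB>cantonensis<TAB>1.5.4" -> tagSpeciesName["ACAC"] = "angiostrongylus_cantonensis"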
if self.verbose: print ("Reading species name mapping files..") | |
speciesNameTagFile = self.speciesNameTagFile | |
tagSpeciesName = {} | |
for line in speciesNameTagFile: | |
if line.startswith('"This') or line.startswith("PREFIX") or "#" in line: | |
continue | |
line = line.strip() | |
spl = line.split("\t") | |
tag = spl[0] | |
sp = spl[1].lower().strip()+"_"+spl[2].strip() #strip because sometimes there are empty spaces
tagSpeciesName[tag] = sp | |
if self.verbose: print ("Total species with locus tag info:\t%s" % len(tagSpeciesName)) | |
self.tagSpeciesName = tagSpeciesName | |
def readGeneIDMappingFolder(self): | |
"""Reads Gene ID mapping files | |
E.g. acanthocheilonema_viteae nAv.1.0.1.g00113 nAv.1.0.1.t00113-RA nAv.1.0.1.t00113-RA nAv.1.0.1.t00113-RA nAvx1x0x1xt00113-RA""" | |
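#Column layout assumed by the parsing below, based on the example line in the docstring:
# spl[0]=species name, spl[1]=gene ID, spl[2]=mRNA ID, spl[5]=interproscan ID, so that
# geneIDMappingAllEntries[mRNA ID] -> spl[1:] and, when the interproscan ID is not "NA",
# geneIDMapping[species][interproscan ID] -> spl[1:-1] (species and interproscan columns dropped).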
if self.verbose: print ("Reading id mapping files..") | |
geneIDMappingFolder = self.geneIDMappingFolder | |
geneIDMapping = {} #key -> species, val -> dict: key -> geneID(interpro), val -> list of other geneIDs #this is to connect genes in interproscan to genes in families.txt | |
geneIDMappingAllEntries = {} #key -> mRNA ID, val -> list of other IDs | |
#Note: not all genes are present in the interproscan files, so the total number of genes for a species here is not representative of the full dataset. | |
filesToProcess = glob(geneIDMappingFolder+"/*_id_mapping.txt") | |
for fi in filesToProcess: | |
f = open(fi,"r") | |
fileName = fi.split("/")[-1].replace("_id_mapping.txt","") | |
if fileName not in geneIDMapping: | |
geneIDMapping[fileName] = {} | |
for line in f: | |
if line.startswith("SPECIES_NAME"): | |
continue | |
line = line.strip() | |
spl = line.split("\t") | |
iprID = spl[5] | |
mRNA = spl[2] | |
gene = spl[1] | |
geneIDMappingAllEntries[mRNA] = spl[1:] | |
if iprID != "NA": | |
if iprID not in geneIDMapping[fileName]: | |
geneIDMapping[fileName][iprID] = [] | |
geneIDMapping[fileName][iprID] = spl[1:-1] #first column is species name | |
f.close() | |
if self.verbose: print ("Total species with id mapping info:\t%s" % (len(geneIDMapping)) ) | |
if self.verbose: print ("Total genes with id mapping info:\t%s" % (len(geneIDMappingAllEntries)) ) | |
self.geneIDMapping = geneIDMapping | |
self.geneIDMappingAllEntries = geneIDMappingAllEntries | |
def readInterproGFFFolder(self): | |
"""Reads interproscan GFF files for all species. The files can be gzipped.""" | |
if self.verbose: print ("Reading interproscan files..") | |
interproscanGFFFolder = self.interproscanGFFFolder | |
tagSpeciesName = self.tagSpeciesName | |
geneIDMapping = self.geneIDMapping | |
interproscanData = {} #key -> species, value -> dict; key -> gene, value -> dict; key-> source, val-> all lines attributed to that gene/source | |
filesToProcess = glob(interproscanGFFFolder+"/*ipr*") | |
IDsNotFound = {} | |
for fi in filesToProcess: | |
if fi.endswith(".gz"): | |
# f = gzip.open(fi, 'rb').read().decode('ascii').split("\n") #this might be memory hungry | |
f = gzip.open(fi, 'rb').read().decode('utf-8').split("\n") #this might be memory hungry # bh4 changed this to utf-8 because ascii fails for the new ipr file for T.regenti | |
else: | |
f = open(fi,"r") | |
##e.g. file name /nfs/helminths02/analysis/50HGP/01INTERPRO/ACOC.protein.fa.gz.fas.ipr.gz | |
tag = fi.split("/")[-1].split(".")[0] | |
species = tagSpeciesName[tag] | |
if species not in interproscanData: | |
interproscanData[species] = {} | |
if species not in geneIDMapping: | |
warn("Warning: %s not in geneIDMapping" % species) | |
for line in f: | |
if "##FASTA" in line: | |
break #I do not want to read the fasta part of GFF. #I assume that after this line appears there will be no more GFF/Interproscan entries | |
if "##" in line: | |
continue | |
line = line.strip() | |
spl = line.split("\t") | |
if len(spl) < 8: #means it is not a proper GFF line
continue | |
source = spl[1] | |
if source not in self.targetSources: | |
continue | |
iprID = spl[0] | |
# if iprID not in geneIDMapping[species]: | |
# if species not in IDsNotFound: | |
# IDsNotFound[species] = set() | |
# IDsNotFound[species].add(iprID) | |
if iprID in geneIDMapping[species]: #It can happen that a gene is not present in geneIDMapping, this is because some interproscan files are not updated, containing genes that have since been removed for compara. | |
geneID = geneIDMapping[species][iprID][0] | |
# else: | |
# warn("IPR ID not in gene mapping file.") | |
if geneID not in interproscanData[species]: | |
interproscanData[species][geneID] = {} | |
if source not in interproscanData[species][geneID]: | |
interproscanData[species][geneID][source] = [] | |
interproscanData[species][geneID][source].append(line) | |
#pprint(interproscanData[species]) | |
#Validation, number of Pfam domains in file | |
#/nfs/helminths02/analysis/50HGP/01INTERPRO/ACOC.protein.fa.gz.fas.ipr.gz 13886 | |
# dr7@farm3-head2:/lustre/scratch108/parasites/dr7/50HG/50HG_100K_families_parsing/interproscanGFFs$ grep Pfam ACOC.protein.fa.gz.fas.ipr | wc -l | |
# 13886 | |
# print ("IDs on IPR GFF but not on ID Mapping file!",len(IDsNotFound)) | |
# for sp in IDsNotFound: | |
# print (sp,len(list(IDsNotFound[sp])) | |
# print (IDsNotFound[sp]) | |
if self.verbose: print ("Total species with interproscan data:\t%s" % len(interproscanData) ) | |
self.interproscanData = interproscanData | |
def readBranchLengths(self): | |
"""Reads branch lengths file. Grabs median. | |
E.g. 22454 0.125 0.081 0.204589,0.263049,0.078644,0.113182,0.282062,0.03627,0.105361,0.177002,0.089029,0.129357,0.076138,0.102818""" | |
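#Sketch of what is kept from the example line above: only the median column is stored, i.e.
# "22454<TAB>0.125<TAB>0.081<TAB>0.204589,..." -> branchLengths[22454] = 0.081
#(the mean and the per-branch list are read but not used here).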
if self.verbose: print ("Reading family branch lengths file..") | |
branchLengthsFile = self.branchLengthsFile | |
branchLengths = {} | |
for line in branchLengthsFile: | |
line = line.strip() | |
familyID,meanBranchLen,medianBranchLen,lista = line.split("\t") #renamed so the locals do not shadow numpy's mean/median imported above
familyID = int(familyID)
branchLengths[familyID] = float(medianBranchLen) #only the median is kept
if self.verbose: print ("Total families with branch lengths:\t%s" % len(branchLengths) ) | |
self.branchLengths = branchLengths | |
def readOutgroupList(self): | |
"""Reads file with outgroup / node ids to exclude from analysis. ID\tspecies/node-name\twhether_is_species_or_node | |
e.g. 40001001 amphimedon_queenslandica species""" | |
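#Example of the two lists built below, for the docstring line above:
# "40001001<TAB>amphimedon_queenslandica<TAB>species" ->
# removeOutgroups gets "amphimedon_queenslandica40001001" (name+ID, matching the per-node file names),
# speciesToRemove gets "amphimedon_queenslandica" (only because the third column is "species").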
if self.verbose: print ("Reading file with outgroups to remove..") | |
removeOutgroupsFile = open(self.removeOutgroupsFile,"r") | |
removeOutgroups = [] | |
speciesToRemove = [] | |
for line in removeOutgroupsFile: | |
line = line.strip() | |
spl = line.split("\t") | |
removeOutgroups.append(spl[1]+spl[0]) | |
if spl[2] == "species": | |
speciesToRemove.append(spl[1]) | |
if self.verbose: print ("Total outgroup nodes/leaves to be removed:\t%s" % len(removeOutgroups) ) | |
if self.verbose: print ("Total species (leaves) to be removed:\t%s" % len(speciesToRemove) ) | |
self.speciesToRemove = speciesToRemove | |
self.removeOutgroups = removeOutgroups | |
def readSpeciesTree(self): | |
"""Reads newick species tree. Node/leaf names must have at least one alphabethic character, otherwise if only numeric Phylo will read names as being branch lengths. Branch lengths are not required. | |
This outputs a tree with parent-child relationships. Will only work for simple binary trees. | |
Also returns number taxons below each node based on the tree.""" | |
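#Sketch of the structures built below (they rely on the external findLastCommonAncestor helpers):
# speciesTree : parent-child relationships as a dict (per the class-level comment: key -> parent, val -> child),
# numTaxonBelow : node/leaf name -> number of leaves under it (via findLastCommonAncestor.countLeaves),
# listOfSpecies : leaf names with the trailing "400..." numeric ID stripped, e.g. via leaf.split("400")[0].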
if self.verbose: print ("Reading species tree..") | |
speciesTreeFile = self.speciesTreeFile | |
numTaxonBelow = {} | |
sppTaxonBelow = {} | |
tree = Phylo.read(speciesTreeFile, 'newick') | |
speciesTree = findLastCommonAncestor.convertTreeToDict(tree) #in python dictionary format | |
reverseDict = findLastCommonAncestor.reverseDict(speciesTree) | |
allNodes = list(speciesTree.keys()) | |
allNodes.extend(list(speciesTree.values()) ) | |
allNodes = set(allNodes) | |
for node in allNodes: | |
numTaxonBelow[node] = int(findLastCommonAncestor.countLeaves(reverseDict,node)) | |
if self.verbose: print ("Total nodes/leaves in species tree:\t%s" % len(speciesTree) ) | |
if self.verbose: print ("Total nodes/leaves with number of taxon below:\t%s" % len(numTaxonBelow) ) | |
self.numTaxonBelow = numTaxonBelow | |
self.speciesTree = speciesTree | |
leaves = [leaf.name for leaf in tree.get_terminals()] | |
listOfSpecies = [] | |
for leaf in leaves: | |
listOfSpecies.append(leaf.split("400")[0]) | |
self.listOfSpecies = listOfSpecies | |
def writeFamiliesPerTaxonomyLevel(self): | |
"""If filtering out some species/nodes, I need to calculate new root for these altered compara families, and create an updated familiesPerTaxonomyLevel. The root (branchName) will then be taken from this file. | |
This has to be run after readFamilies. The external 'findLastCommonAncestor' functions have been tested previously. | |
This function has been properly validated: the created file is the same as the one from Compara, if not removing any outgroups.""" | |
#e.g. | |
#Hymenolepis microstoma 40001043 : 841230,912843,1018527,1058154,1059819 | |
#Strongyloides 40001088 : 269778,518849,529306,576845,647355,668201,705124,735608 | |
if self.verbose: print ("Creating familiesPerTaxonomyLevel.txt with recalculated roots..") | |
familiesDict = self.familiesDict | |
speciesTree = self.speciesTree | |
tmpFile = open("tmp_familiesPerTaxonomyLevel_"+self.jobID+".txt","w") | |
speciesKeys = list(speciesTree.keys()) | |
familiesTaxonLevel = {} #this is temporary structure to store what will be written in tmp file | |
#loop through each family, calculate new root and write in file. | |
for familyID in familiesDict: | |
species = list(familiesDict[familyID].keys()) #these are e.g. clonorchis_sinensis, but we need to match to speciesTree clonorchis_sinensis40001019 | |
speciesKey = "" | |
renamedSpecies = [] | |
for sp in species: | |
for key in speciesKeys: | |
if sp in key: | |
renamedSpecies.append(key) | |
assert (len(species) == len(renamedSpecies)) #otherwise species-tree species names do not match families.txt species names.. | |
rootNode = findLastCommonAncestor.run(renamedSpecies,speciesTree) | |
#rename rootNode so that it resembles the same as in familiesPerTaxonomyLevel. | |
#This bit may be particular to 50HG Compara. But it should not matter for other databases/ID types; the file will simply look different.
if "4000" in rootNode: | |
spl = rootNode.partition("4000") | |
name = spl[0] | |
ID = "".join(spl[1:]) | |
if "_" in name: | |
name = name.replace("_"," ") | |
name = name[0].upper()+name[1:] | |
rootNode = name+" "+ID | |
if rootNode not in familiesTaxonLevel: | |
familiesTaxonLevel[rootNode] = [] | |
familiesTaxonLevel[rootNode].append(str(familyID) ) | |
#write to new file | |
for taxon in familiesTaxonLevel: | |
tmpFile.write(taxon+" : "+",".join(familiesTaxonLevel[taxon])+"\n") | |
tmpFile.close() | |
if self.verbose: print ("Total number of node/species in new familiesPerTaxonomyLevel file:\t%s" % len(familiesTaxonLevel)) #this should be equal to: total nodes+species in species tree (180 for 50HG) minus the removed ones. | |
self.familiesPerTaxonomyLevelFile = tmpFile.name #change familiesPerTaxonomyLevelFile file pointer to newly created file | |
def readFlagFamilyList(self): | |
"""Read file with list of families to be flagged.""" | |
if self.verbose: print ("Reading file with families to flag with 'exclude'..") | |
flagFamilyListFile = open(self.flagFamilyListFile,"r") | |
familiesToFlag = [int(familyID.strip()) for familyID in flagFamilyListFile] | |
self.familiesToFlag = familiesToFlag | |
if self.verbose: print ("Total number of families to be flagged: %s" % (len(familiesToFlag)) ) | |
def readGeneFilterList(self): | |
"""Read file with list of genes to be excluded from measures.""" | |
if self.verbose: print ("Reading file with genes to exclude..") | |
geneIDMappingAllEntries = self.geneIDMappingAllEntries #mRNA ID to other IDs | |
geneFilterListFile = open(self.geneFilterListFile,"r") | |
#convert mRNA ID to gene ID, to match families | |
geneFilterList = {geneIDMappingAllEntries[geneID.strip()][0] for geneID in geneFilterListFile} | |
self.geneFilterList = geneFilterList | |
if self.verbose: print ("Total number of genes to be excluded: %s" % (len(geneFilterList)) ) | |
def readCladeFile(self): | |
"""Read clade file, there must be a clade per each species""" | |
if self.verbose: print ("Reading clade file..") | |
cladeFile = self.cladeFile | |
listOfSpecies = self.listOfSpecies | |
cladePerSpecies = {} #key -> species, val -> clade | |
speciesPerClade = {} #key -> clade, value -> species in the clade (based on the species tree) | |
for line in cladeFile: | |
line = line.strip() | |
spl = line.split("\t") | |
sp = spl[0] | |
clade = spl[1] | |
sp = sp.lower() | |
sp = sp.replace(" ","_") | |
if sp in listOfSpecies: | |
cladePerSpecies[sp] = clade | |
if clade not in speciesPerClade: | |
speciesPerClade[clade] = [] | |
speciesPerClade[clade].append(sp) | |
self.cladePerSpecies = cladePerSpecies | |
self.speciesPerClade = speciesPerClade | |
if self.verbose: print ("Total number of species in clade file: %s" % (len(cladePerSpecies)) ) | |
def readNodeInfoFile(self): | |
"""Reads file with information on leaves for each node. E.g. | |
node amphimedon_queenslandica40001001 has parent metazoa40001000 | |
node euteleostomi40001177 has descendants danio_rerio40001178,homo_sapiens40001179 | |
Retrieves the set of Clades present for each node. All species of that clade need to be present for a clade to be considered (e.g. paraphyletic).""" | |
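#Sketch of the clade check below: a clade is added to nodeInfo[nodeID] only if every species of that clade
#(as listed in speciesPerClade) appears among the node's descendants; e.g. if a clade contains 4 species,
#all 4 must be under the node for the clade to be counted (the count of 4 is illustrative).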
if self.verbose: print ("Reading node info file..") | |
nodeInfoFile = self.nodeInfoFile | |
speciesPerClade = self.speciesPerClade | |
cladePerSpecies = self.cladePerSpecies | |
nodeInfo = {} #key -> node ID, list of clades potentially present under that node. | |
nodeInfoSpp = {} #key -> node ID, list of species present under that node. | |
for line in nodeInfoFile: | |
line = line.strip() | |
if "descendants" in line: | |
spl = line.split(" ") | |
nodeID = spl[1] | |
descendants = spl[4].split(",") | |
cladeList = [] | |
for des in descendants: | |
des = des.split("400")[0] | |
cladeList.append(cladePerSpecies[des]) | |
descendantClades = set() | |
for clade in speciesPerClade: | |
if cladeList.count(clade) == len(speciesPerClade[clade]): #to make sure the whole clade is complete. I should only count a descendant clade if all species are present | |
descendantClades.add(clade) | |
nodeInfo[nodeID] = sorted(list(descendantClades)) | |
nodeInfoSpp[nodeID] = [des.split("400")[0] for des in descendants] | |
self.nodeInfo = nodeInfo | |
self.nodeInfoSpp = nodeInfoSpp | |
if self.verbose: print ("Total number of tree nodes with leaf info: %s" % (len(nodeInfo)) ) | |
def readProteinLengthFiles(self): | |
"""Read list of all genes and their length""" | |
if self.verbose: print ("Reading protein lengths folder..") | |
proteinLengthsFolder = self.proteinLengthsFolder | |
proteinLengths = {} #key -> species, val -> dict. Key -> gene id, val -> length | |
filesToProcess = glob(proteinLengthsFolder+"/*_protein_lengths.txt") | |
count = 0 | |
for f in filesToProcess: | |
inFile = open(f,"r") | |
spName = f.split("/")[-1].replace("_protein_lengths.txt","") | |
if spName not in proteinLengths: | |
proteinLengths[spName] = {} | |
for line in inFile: | |
gene,geneLength = line.strip().split("\t") | |
proteinLengths[spName][gene] = geneLength | |
count+=1 | |
inFile.close() | |
self.proteinLengths = proteinLengths | |
if self.verbose: print ("Total number of species with protein lengths: %s and total genes: %s" % (len(proteinLengths),count) ) | |
def statsPerFamily(self): | |
"""Main function to write stats for each family""" | |
if self.verbose: print ("Making stats..") | |
#1) all global data structures in use #just wrote this for easier localization and to make sure I don't change the values
familiesDict = self.familiesDict | |
familiesTaxon = self.familiesTaxon | |
numTaxonBelow = self.numTaxonBelow | |
familiesGeneDuplication = self.familiesGeneDuplication | |
familiesGeneDuplicationMaxNode = self.familiesGeneDuplicationMaxNode | |
familiesGeneLosses = self.familiesGeneLosses | |
interproscanData = self.interproscanData | |
freeLivingSpecies = self.freeLivingSpecies | |
branchLengths = self.branchLengths | |
familiesToFlag = self.familiesToFlag | |
cladePerSpecies = self.cladePerSpecies | |
speciesPerClade = self.speciesPerClade | |
listOfSpecies = self.listOfSpecies | |
nodeInfo = self.nodeInfo | |
nodeInfoSpp = self.nodeInfoSpp | |
familiesDictLengths = self.familiesDictLengths | |
#2) output file headers | |
self.outputFile.write("familyID\tFlag\tn_species\tn_genes\tMean_genes_per_species\tMedian_genes_per_species\tVariation_coefficient_n_genes_per_species\tn_paralogs\tBranch_name\tCompleteness_score\tTotal_duplications\tMax_duplications_node\tMost_frequent_species\t") | |
for species in sorted(freeLivingSpecies): | |
self.outputFile.write(species+"\t") | |
self.outputFile.write("Total_losses\tMedian_branch_length\tPfam_perc_genes\tTMHMM_perc_genes\tSignalP_perc_genes\tPfam_perc_in_family\tPfam_all_in_family\n") | |
outputFileMeasures = open(self.outputFile.name+"_measures","w") #special file just for the measures | |
outputFileMeasures.write("familyID\tFlag\tn_species\tn_genes\tmean_prot_len\tsum_prot_len\tSpecies_var_coef\tSpecies_var_coef_zeroes\tVar_coef_lengths_zeroes\t") | |
for clade in sorted(speciesPerClade): | |
outputFileMeasures.write("zscore_%s\t" % clade) | |
outputFileMeasures.write("Max_zscore_clade\tMax_zscore\t") | |
for clade in sorted(speciesPerClade): | |
outputFileMeasures.write("enrich_%s\t" % (clade) ) | |
outputFileMeasures.write("Max_enrich_clade\tMax_enrich\t") | |
outputFileMeasures.write("Pfam_perc_in_family\tPfam_all_in_family\n") | |
#3) Loop through each compara family, and produce all stats for this family | |
count = 0 #count processed families | |
for family in familiesDict: | |
count+=1 | |
#3.1) General basic stats | |
nSpecies = len(familiesDict[family]) #note that this familiesDict only contain species entry if any gene in that family (does not count zeroes) | |
nGenesPerSpecies = [len(familiesDict[family][sp]) for sp in familiesDict[family]] | |
proteinLengthsSpecies = [familiesDictLengths[family][sp] for sp in familiesDictLengths[family]] #this is a list of per-species protein length sums
nGenes = sum(nGenesPerSpecies) | |
sumProteinLengths = sum(proteinLengthsSpecies) | |
avgProteinLenPerGene = "%.1f" % (sumProteinLengths/nGenes) | |
nParalogs = nGenes - nSpecies | |
avgGenesPerSpecies = "%.1f" % (mean(nGenesPerSpecies)) | |
medianGenesPerSpecies = "%.1f" % (median(nGenesPerSpecies)) | |
stdevGenesPerSpecies = std(nGenesPerSpecies,ddof=1) | |
# 21-Aug-2015: Variation coefficient = Stdev of number of genes in species with at least one gene / mean of number of genes in species with at least one gene | |
# this excludes the zeroes (species without any gene in that family) | |
stdevDivMean = "%.1f" % (stdevGenesPerSpecies/float(avgGenesPerSpecies)) # Variations in numbers of genes per family across species -> stdev / mean | |
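#Worked example of the variation coefficient above (illustrative numbers): genes per species [4, 1, 1]
#gives mean = 2.0 and sample std (ddof=1) ~= 1.73, so the value written out is 1.73/2.0 ~= 0.9.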
#3.2) tag families to exclude | |
if family in familiesToFlag: | |
familyFlag = "exclude" | |
else: | |
familyFlag = "" | |
#3.3) Calculate/retrieve Branch name, median branch length and completeness score | |
branchName = familiesTaxon[family] | |
if family in branchLengths: | |
medianBranchLength = branchLengths[family] | |
else: #this should not happen except for a couple of known exceptions e.g. family 1109776 when only 33 species, because pruning did not work. #as I do not know if there are/will be other exceptions I decided to put "NA" when it does not work. | |
medianBranchLength = "NA" | |
nSpeciesBelowTaxon = numTaxonBelow[branchName] | |
completeness = "%.2f" % (nSpecies/nSpeciesBelowTaxon) | |
#Validation completeness for family: 80624. On James plot shows 0.5 completeness score. VERIFIED 80624 41 319 7.8 2.0 278 Metazoa40001000 0.5 | |
#Validation completeness for family: 63382. On James plot shows <0.25 completeness score. VERIFIED 63382 10 393 39.3 36.5 383 Chromadorea40001076 0.2 | |
#3.4) recalculate species variation coefficient, having insight from phylogeny. | |
#filling the zeroes when a species should have a gene (based on tree root) but does not | |
zeroesMissing = nSpeciesBelowTaxon - nSpecies #species that should have a gene but don't, based on the family species root | |
assert(zeroesMissing >= 0) | |
nGenesPerSpeciesZeroes = nGenesPerSpecies[:] #nGenesPerSpeciesZeroes = nGenesPerSpecies[:] | |
for i in range(zeroesMissing): | |
nGenesPerSpeciesZeroes.append(0) | |
stdnGenesPerSpeciesZeroes = std(nGenesPerSpeciesZeroes,ddof=1) | |
meannGenesPerSpeciesZeroes = mean(nGenesPerSpeciesZeroes) | |
speciesVarCoefZeroes = "%.1f" % (stdnGenesPerSpeciesZeroes / meannGenesPerSpeciesZeroes ) | |
#variation coefficient for protein lengths | |
proteinLengthsSpeciesZeroes = proteinLengthsSpecies[:] #nGenesPerSpeciesZeroes = nGenesPerSpecies[:] | |
for i in range(zeroesMissing): | |
proteinLengthsSpeciesZeroes.append(0) | |
stdProteinLengthsSpeciesZeroes = std(proteinLengthsSpeciesZeroes,ddof=1) | |
meanProteinLengthsSpeciesZeroes = mean(proteinLengthsSpeciesZeroes) | |
speciesVarCoefZeroesLengths = "%.1f" % (stdProteinLengthsSpeciesZeroes / meanProteinLengthsSpeciesZeroes ) | |
#3.5) retrieve gene losses and gene duplication data | |
if family in familiesGeneDuplication: | |
duplications = familiesGeneDuplication[family] | |
else: | |
duplications = 0 | |
if family in familiesGeneLosses: | |
losses = familiesGeneLosses[family] | |
else: | |
losses = 0 | |
if family in familiesGeneDuplicationMaxNode: | |
maxDuplicationNode = familiesGeneDuplicationMaxNode[family] | |
else: | |
maxDuplicationNode = "None" | |
#initialize interproscan-related items and others | |
nGenesWithPfam = 0 #at least 1 Pfam entry | |
nGenesWithSignalP = 0 #at least 1 SignalP entry | |
nGenesWithTMHMM = 0 #at least 1 TMHMM entry | |
pfamTerms = [] # this is to store one pfam domain example per gene | |
pfamTermsAll = [] # this is to store all pfam domains in a gene (not just one per gene) | |
maxSp = "" #the species with most number of genes | |
maxSpProtLen = 0 | |
protLenPerClade = {} #key -> clade, val -> list of protein lengths in species of that clade . note that I only initialize a clade if any species on that clade, and only contains data on species with at least one gene. | |
protLenPerSp = {} #key -> sp, val -> prot length | |
#3.6) Loop through each species in gene family. To retrieve interproscan info and make other calculations | |
for species in sorted(familiesDict[family]): | |
#ngenesSp = len(familiesDict[family][species]) #if calculating measures based on gene numbers | |
protLenSp = familiesDictLengths[family][species] #if calculating measures using protein length instead of gene numbers, without changing all variable names.. | |
protLenPerSp[species] = familiesDictLengths[family][species] #redundant but needed #only initializes species if any gene | |
#3.7) Get genes per clade | |
clade = cladePerSpecies[species] | |
if clade not in protLenPerClade: | |
protLenPerClade[clade] = [] | |
protLenPerClade[clade].append(protLenSp) | |
#3.8) get measure of species with most genes in family | |
if protLenSp > maxSpProtLen: | |
maxSpProtLen = protLenSp | |
maxSp = species | |
# #testing: block start. Uncomment this when running for real | |
#3.9) get interproscan data | |
for gene in familiesDict[family][species]: | |
if gene in interproscanData[species]: #it is normal that a gene may not have any interproscan entries. | |
#Validation. Gene present in IPR file and with correct description | |
#Python: syphacia_muris SMUV_0000960301 {'Pfam': ['SMUV_0000960301-mRNA-1\tPfam\tprotein_match\t71\t295\t4.6E-58\t+\t.\tName=PF00089;signature_desc=Trypsin;Target=SMUV_0000960301-mRNA-1 71 295;status=T;ID=match$342_71_295;Ontology_term="GO:0004252","GO:0006508";date=04-06-2014;Dbxref="InterPro:IPR001254"']} | |
#Farm: /lustre/scratch108/parasites/dr7/50HG/50HG_100K_families_parsing/interproscanGFFs_testing$ grep SMUV_0000960301 SMUV.protein.fa.gz.fas.ipr | |
# SMUV_0000960301-mRNA-1 Pfam protein_match 71 295 4.6E-58 + . Name=PF00089;signature_desc=Trypsin;Target=SMUV_0000960301-mRNA-1 71 295;status=T;ID=match$342_71_295;Ontology_term="GO:0004252","GO:0006508";date=04-06-2014;Dbxref="InterPro:IPR001254" | |
typesOfEntries = list(interproscanData[species][gene].keys()) #these are dictionary keys, therefore no duplicates
assert (len(typesOfEntries) <= len(self.targetSources)) | |
for entryType in typesOfEntries: | |
if "Pfam" in entryType: | |
nGenesWithPfam+=1 | |
spl = "".join(interproscanData[species][gene][entryType]).split(";") #there may be several lines of pfam domains for same gene | |
#Note: the presence of pfam entries for the same gene in the IPR GFF is random, (i.e. the last entry is not always the most N-terminal domain etc, its just random), | |
#Note(cont): meaning that picking only 1 pfam example for each gene is randomized (but the same everytime we run this) | |
##originally I was picking only one pfam domain per gene | |
desc = "" | |
pfamTermsSet = set() | |
for item in spl: | |
if item.startswith("signature_desc="): | |
desc = item.replace("signature_desc=","") | |
pfamTermsSet.add(desc) | |
for term in pfamTermsSet: | |
pfamTerms.append(term) | |
##then I also added column where we have all pfam domains in gene | |
for item in spl: | |
desc = "" | |
if item.startswith("signature_desc="): | |
desc = item.replace("signature_desc=","") | |
pfamTermsAll.append(desc) | |
if "SignalP" in entryType: #this can in principle match any SignalP_* entry | |
nGenesWithSignalP+=1 | |
if "TMHMM" in entryType: | |
nGenesWithTMHMM+=1 | |
# else: #dr7@farm3-head2:/lustre/scratch108/parasites/dr7/50HG/50HG_100K_families_parsing/interproscanGFFs_testing$ grep HNAJ_0000929301 HNAJ.protein.fa.gz.fas.fas.sl.ipr | |
# print (species,gene) | |
#testing: block end. Uncomment this when running for real | |
mostFrequentSpecies = str(maxSpProtLen)+"X:"+maxSp #species with most genes/prot length, and how many | |
#3.10) calculations on interproscan data #note this really looks at genes, not at protein length. Stats are gene-based, not altered for protein length. | |
assert (nGenesWithPfam <= nGenes) | |
assert (nGenesWithTMHMM <= nGenes) | |
assert (nGenesWithSignalP <= nGenes) | |
percPfam = "%.1f" % (nGenesWithPfam*100.0/nGenes) | |
percSignalP = "%.1f" % (nGenesWithSignalP*100.0/nGenes) | |
percTMHMM = "%.1f" % (nGenesWithTMHMM*100.0/nGenes) | |
##Getting 3 most common pfam terms, and % genes in family with that pfam | |
pfams = {} | |
pfamText = "" | |
for term in pfamTerms: | |
if term not in pfams: | |
pfams[term] = float("%.1f" % (pfamTerms.count(term)*100.0/nGenes ) ) | |
pfamsSorted = sorted(pfams, key=pfams.get, reverse=True) #need numeric sorting | |
for term in pfamsSorted[0:3]: #top 3 pfam domains #if less than 3 exist will simply pick less | |
pfamText+=str(pfams[term])+"%:"+term+";" | |
if pfamText == "": | |
pfamText = "100%:None" | |
##Getting all pfam domains in family statistics, and how many times | |
pfamsAll = {} | |
pfamTextAll = "" | |
for term in pfamTermsAll: | |
if term not in pfamsAll: | |
pfamsAll[term] = pfamTermsAll.count(term) | |
pfamsSortedAll = sorted(pfamsAll, key=pfamsAll.get, reverse=True) | |
for term in pfamsSortedAll: | |
pfamTextAll+=str(pfamsAll[term])+"X:"+term+";" | |
if pfamTextAll == "": | |
pfamTextAll = "0X:None" | |
#3.11) calculate ranking measures | |
### IMPORTANT NOTE ### 14-Sep: Even though variable names and comments still say "genes", it may actually be using protein lengths. | |
#Note that all these measures count zeroes. | |
#E.g. I know the species tree node corresponding to root of the gene tree. Therefore I always take into account all the species AND clades that SHOULD be present in the calculation. | |
##3.11.1) Calculate Z-score for each clade. Z-score = (mean prot length per species in the clade - mean prot length per species overall) / (std of prot length per species outside the clade; see the 14-Jan-2016 change below)
#This only works if more than 1 clade. Otherwise, all entries have 'NA' | |
#I always check root and calculate for each clade descendant of the root. Other clades will have 'NA'. | |
#note that I always divide by all species in the clade, regardless of whether all of them have a gene or not
#note: standard deviation can be zero (e.g. all species having same number prot length), and we cannot divide by zero. I tag these cases with "NA_std_is_zero" | |
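#Worked example of the z-score below (illustrative numbers): if the clade's per-species mean protein length
#is 1.75, the overall per-species mean (zeroes included) is 0.89, and the std dev of protein lengths in the
#species *outside* the clade is 0.5, then zscore = (1.75 - 0.89) / 0.5 = 1.72.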
#get clades that should be present. | |
if branchName in nodeInfo: #if not, it means it is a leaf, not an internal node | |
descendantClades = nodeInfo[branchName] | |
else: | |
assert("_" in branchName) | |
descendantClades = [] | |
#calculate mean protein length per clade #adding zeroes if a species does not have a gene
meanProtLenPerClade = {} #key -> clade, value -> mean prot length per species in that clade (total, including zeroes) #note this dict only contains clade entries if they should have any value | |
sdProtLenOutsideClade = {} #key -> clade, value -> std dev of prot length per species that are *not* in this clade. Added Avril Coghlan 14-Jan-2016 | |
for clade in descendantClades: | |
if clade in protLenPerClade: | |
meanProtLenPerClade[clade] = sum(protLenPerClade[clade]) / float(len(speciesPerClade[clade] ) ) | |
# calculate the standard deviation of protein lengths in species outside this clade. Added Avril Coghlan 14-Jan-2016 | |
otherSpecies = [] #list of protein lengths for the species not in current clade, will include zeroes. Added Avril Coghlan 14-Jan-2016 | |
for species in nodeInfoSpp[branchName]: #species under the tree root. Added Avril Coghlan 14-Jan-2016. | |
if cladePerSpecies[species] == clade: # Added Avril Coghlan 14-Jan-2016 | |
continue # Added Avril Coghlan 14-Jan-2016 | |
else: # Added Avril Coghlan 14-Jan-2016 | |
if species in protLenPerSp: #protLenPerSp only contains species entry if any gene. Added Avril Coghlan 14-Jan-2016. | |
otherSpecies.append(protLenPerSp[species]) # Added Avril Coghlan 14-Jan-2016. | |
else: # Added Avril Coghlan 14-Jan-2016. | |
otherSpecies.append(0) # Added Avril Coghlan 14-Jan-2016. | |
stdProteinLengthsSpeciesZeroesOutsideClade = std(otherSpecies, ddof=1) # Added Avril Coghlan 14-Jan-2016. Used ddof=1 as Diogo has used this elsewhere for std. dev. | |
sdProtLenOutsideClade[clade] = stdProteinLengthsSpeciesZeroesOutsideClade # Added Avril Coghlan 14-Jan-2016. | |
else: #if no species have it | |
meanProtLenPerClade[clade] = 0 | |
sdProtLenOutsideClade[clade] = 0 # Added by bh4 - 17-Jan-2016 | |
assert(set(meanProtLenPerClade.keys()) == set(descendantClades) ) | |
#actually calculate z-scores | |
zscoreResults = {} #key -> clade, val -> zscore | |
maxZscore = 0 #will only count positive Zscores | |
maxZclade = "NA" | |
for clade in sorted(speciesPerClade): #just so I loop over all clades | |
if len(descendantClades) < 2: #cannot calculate clade Z-score if only one clade | |
zscoreResults[clade] = "NA" | |
else: | |
if clade in meanProtLenPerClade: | |
# use the standard deviation of protein lengths in species outside this clade. Added Avril Coghlan 14-Jan-2016. | |
stdProteinLengthsSpeciesZeroesOutsideClade = sdProtLenOutsideClade[clade] # Added Avril Coghlan 14-Jan-2016. | |
# if stdnGenesPerSpeciesZeroes == 0: Commented out Avril Coghlan 14-Jan-2016 | |
if stdProteinLengthsSpeciesZeroesOutsideClade == 0: # Added Avril Coghlan 14-Jan-2016 | |
zscoreResults[clade] = "NA_std_is_zero" | |
else: | |
# use the standard deviation of protein lengths in species outside this clade. Added Avril Coghlan 14-Jan-2016. | |
zscore = (meanProtLenPerClade[clade] - meanProteinLengthsSpeciesZeroes) / stdProteinLengthsSpeciesZeroesOutsideClade # Added Avril Coghlan 14-Jan-2016 | |
# zscore = (meanProtLenPerClade[clade] - meanProteinLengthsSpeciesZeroes) / stdProteinLengthsSpeciesZeroes. Commented out Avril Coghlan 14-Jan-2016 | |
#validated using http://ncalculators.com/statistics/z-score-calculator.htm | |
zscoreResults[clade] = zscore | |
if zscore > maxZscore: | |
maxZscore = zscore | |
maxZclade = clade | |
else: | |
zscoreResults[clade] = "NA" | |
maxZscore = "%.2f" % maxZscore | |
#Z-score calculation validation: | |
# ##Mono-clade family | |
# family 1212416 : NECAME_04199 (necator_americanus) NBR_0000394301 (nippostrongylus_brasiliensis) NBR_0001482701 (nippostrongylus_brasiliensis) NBR_0002215501 (nippostrongylus_brasiliensis) | |
# 1212416 0.5 0.5 NA NA NA NA NA NA NA 0.0 | |
# #CORRECT | |
# ##Two-neighbour-clade family | |
# family 851973 : csin104847 (clonorchis_sinensis) csin112867 (clonorchis_sinensis) D915_09852 (fasciola_hepatica) D915_09853 (fasciola_hepatica) SSLN_0001350701 (schistocephalus_solidus) Sjp_0074790 (schistosoma_japonicum) Sjp_0074800 (schistosoma_japonicum) Smp_166880 (schistosoma_mansoni) | |
# Em Ts Hm Mc Ss Cs Fh Sj Sm | |
# 0 0 0 0 1 2 2 2 1 | |
# avg 0.888888889 zscore tapeworm -0.787400787 | |
# std 0.874889764 zscore fluke 0.984250984 | |
# tapeworm avg 0.2 | |
# fluke avg 1.75 | |
# #VALUES VALIDATED, MANUAL CALCULATION! | |
# ##Root with 6 clades, but genes in only 3 clades. | |
# descendants: {'IV', 'V', 'Tapeworm', 'III', 'I', 'Flukes'} data keys: ['I', 'IV', 'Flukes'] | |
# family 919890 : csin102733 (clonorchis_sinensis) D915_07255 (fasciola_hepatica) MhA1_Contig762.frz3.gene6 (meloidogyne_hapla) nRc.2.0.1.g27258 (romanomermis_culicivorax) Sjp_0091230 (schistosoma_japonicum) Smp_090010 (schistosoma_mansoni) | |
# #VALUES VALIDADED, MANUAL CALCULATION! | |
##3.11.2) Calculate clade gene enrichment (Avril's/Bhavana's measure)
# mean prot length per species in a clade/mean prot length per species in all other species | |
#Note that I use root to see which clades could be present, and I count zeroes on clades and species. When I say "all other species" it consists of the species that could have a gene based on root. | |
# enrichResults = {} #key -> clade, val -> zscore | |
# maxEnrich = 0 #will only count positive Zscores | |
# maxEnrichClade = "NA" | |
# for clade in sorted(speciesPerClade): #just so I loop over all clades | |
# if len(descendantClades) < 2: #cannot calculate if only one clade | |
# enrichResults[clade] = "NA" | |
# else: | |
# if clade in meanProtLenPerClade: #clades that should have genes (but may not have) | |
# otherSpecies = [] #list of number of genes for the species not in current clade, will include zeroes | |
# for cladeDesc in descendantClades: #loop over all clades that should/could contain a gene based on root, not just the ones that do have a gene | |
# if cladeDesc == clade: #if current clade | |
# continue | |
# if cladeDesc in protLenPerClade: #this means that there will be at least one species with a gene here | |
# otherSpecies.extend(protLenPerClade[cladeDesc]) | |
# missingZeroes = speciesPerClade[cladeDesc] - len(protLenPerClade[cladeDesc]) #the protLenPerClade only contains data on species with genes, therefore need to add zeroes | |
# for m in range(missingZeroes): | |
# otherSpecies.append(0) | |
# else: #add zeroes accordingly | |
# l = [0]*speciesPerClade[cladeDesc] | |
# otherSpecies.extend(l) | |
# #print (family, branchName, clade, len(otherSpecies),speciesPerClade[clade],nSpeciesBelowTaxon) | |
# assert (len(otherSpecies) + speciesPerClade[clade] == nSpeciesBelowTaxon) | |
# enrich = meanProtLenPerClade[clade] / mean(otherSpecies) #meannGenesPerSpeciesZeroes #mean(otherSpecies) | |
# enrichResults[clade] = enrich | |
# if enrich > maxEnrich: | |
# maxEnrich = enrich | |
# maxEnrichClade = clade | |
# else: #so that I write NA in clades which should not have genes in this family | |
# enrichResults[clade] = "NA" | |
# assert (len(enrichResults) == len(speciesPerClade)) | |
# maxEnrich = "%.2f" % maxEnrich | |
# ##Another approach to calculate clade enrichment, allowing paraphyletic groups. | |
#Note that if gene tree root is in between a paraphyletic group, the score is calculated for all members of the paraphyletic clade. | |
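#Worked example of the enrichment below (illustrative numbers): if the clade's per-species mean protein
#length is 1.75 and the mean over the other species under the gene-tree root (zeroes included) is 0.5,
#then enrich = 1.75 / 0.5 = 3.5 for that clade.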
enrichResults = {} #key -> clade, val -> zscore | |
maxEnrich = 0 #max will only count positive Zscores | |
maxEnrichClade = "NA" | |
for clade in sorted(speciesPerClade): #loop over all clades, regardless of having gene or not, to fill all columns in output | |
if len(descendantClades) < 2: #cannot calculate if only one clade | |
enrichResults[clade] = "NA" | |
else: #calculate enrichment | |
if clade in meanProtLenPerClade: #clades that should have genes (but may not have) #note that meanProtLenPerClade already accounts for zeroes | |
otherSpecies = [] #list of protein lengths for the species not in current clade, will include zeroes | |
countSppInClade = 0 #just for validation #because of paraphyletic groups, the number of species in a clade may be different from number of species below gene tree root. | |
for species in nodeInfoSpp[branchName]: #species under the tree root | |
if cladePerSpecies[species] == clade: | |
countSppInClade+=1 | |
continue | |
else: | |
if species in protLenPerSp: #protLenPerSp only contains species entry if any gene | |
otherSpecies.append(protLenPerSp[species]) | |
else: | |
otherSpecies.append(0) | |
assert (len(otherSpecies) + countSppInClade == nSpeciesBelowTaxon) | |
enrich = meanProtLenPerClade[clade] / mean(otherSpecies) | |
enrichResults[clade] = enrich | |
if enrich > maxEnrich: | |
maxEnrich = enrich | |
maxEnrichClade = clade | |
else: #so that I write NA in clades which do not have genes in this family | |
enrichResults[clade] = "NA" | |
assert (len(enrichResults) == len(speciesPerClade)) | |
maxEnrich = "%.2f" % maxEnrich | |
##Confirmed that this approach gives the same results as the previous, if using same dataset. | |
#4) Writing to output file, one family at a time | |
#main output file | |
self.outputFile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t" % (family,familyFlag,nSpecies,nGenes,avgGenesPerSpecies,medianGenesPerSpecies,stdevDivMean,nParalogs,branchName,completeness,duplications,maxDuplicationNode,mostFrequentSpecies) ) | |
##Presence in free-living species | |
for species in sorted(freeLivingSpecies): | |
if species in familiesDict[family]: | |
self.outputFile.write("%s\t" % len(familiesDict[family][species])) | |
else: | |
self.outputFile.write("0\t") | |
self.outputFile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (losses,medianBranchLength,percPfam,percTMHMM,percSignalP,pfamText,pfamTextAll) ) | |
#measures file | |
outputFileMeasures.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t" % (family,familyFlag,nSpecies,nGenes,avgProteinLenPerGene,sumProteinLengths,stdevDivMean,speciesVarCoefZeroes,speciesVarCoefZeroesLengths) ) | |
for clade in sorted(speciesPerClade): | |
outputFileMeasures.write("%s\t" % (zscoreResults[clade]) ) | |
outputFileMeasures.write("%s\t%s\t" % (maxZclade,maxZscore) ) | |
for clade in sorted(speciesPerClade): | |
outputFileMeasures.write("%s\t" % (enrichResults[clade]) ) | |
outputFileMeasures.write("%s\t%s\t" % (maxEnrichClade,maxEnrich) ) | |
outputFileMeasures.write("%s\t%s\n" % (pfamText,pfamTextAll) ) | |
self.outputFile.close() | |
outputFileMeasures.close() | |
if self.verbose: print ("Total number of families processed:\t%s" % count ) | |
def writeFamilies(self): | |
"""(Optional function) Writes the families (like in families.txt file) after all the filterings, the way they are actually processed. | |
On the original files there is no apparent gene order sorting. Here there will be species sorting. | |
Intended output example | |
------------------------------------------------------------------------------- | |
family 1584328 : ASU_08438 (ascaris_suum) ALUE_0001329201 (ascaris_lumbricoides) | |
------------------------------------------------------------------------------- | |
family 1584332 : nRc.2.0.1.g04834 (romanomermis_culicivorax) nRc.2.0.1.g36069 (romanomermis_culicivorax) | |
""" | |
if self.verbose: print ("Writing new families.txt file..") | |
familiesDict = self.familiesDict | |
dumpFamiliesFile = open(self.dumpFamiliesFile,"w") | |
count = 0 | |
for family in sorted(familiesDict): | |
count+=1 | |
text = "-------------------------------------------------------------------------------\n" | |
text+= "family %s :" % family | |
for species in sorted(familiesDict[family]): | |
for gene in sorted(familiesDict[family][species]): | |
text+=" %s (%s)" % (gene,species) | |
text+="\n" | |
dumpFamiliesFile.write(text) | |
dumpFamiliesFile.close() | |
if self.verbose: print ("Wrote %s families at %s" % (count,dumpFamiliesFile.name)) | |
def cleanUp(self): | |
"""removes temporary files""" | |
#pass | |
os.system("rm tmp_familiesPerTaxonomyLevel_"+self.jobID+".txt") | |
def dumpAll(self): | |
if self.verbose: print ("Writing pickle dump files..") | |
pickle.dump(self.familiesDict,open(self.dumpsFolder+"/familiesDict","wb") ) | |
pickle.dump(self.familiesTaxon,open(self.dumpsFolder+"/familiesTaxon","wb") ) | |
pickle.dump(self.numTaxonBelow,open(self.dumpsFolder+"/numTaxonBelow","wb") ) | |
pickle.dump(self.familiesGeneDuplication,open(self.dumpsFolder+"/familiesGeneDuplication","wb") ) | |
pickle.dump(self.familiesGeneDuplicationMaxNode,open(self.dumpsFolder+"/familiesGeneDuplicationMaxNode","wb") ) | |
pickle.dump(self.familiesGeneLosses,open(self.dumpsFolder+"/familiesGeneLosses","wb") ) | |
pickle.dump(self.tagSpeciesName,open(self.dumpsFolder+"/tagSpeciesName","wb") ) | |
pickle.dump(self.geneIDMapping,open(self.dumpsFolder+"/geneIDMapping","wb") ) | |
pickle.dump(self.interproscanData,open(self.dumpsFolder+"/interproscanData","wb") ) | |
pickle.dump(self.branchLengths,open(self.dumpsFolder+"/branchLengths","wb") ) | |
pickle.dump(self.cladePerSpecies,open(self.dumpsFolder+"/cladePerSpecies","wb") ) | |
pickle.dump(self.speciesPerClade,open(self.dumpsFolder+"/speciesPerClade","wb") ) | |
pickle.dump(self.speciesTree,open(self.dumpsFolder+"/speciesTree","wb") ) | |
pickle.dump(self.removeOutgroups,open(self.dumpsFolder+"/removeOutgroups","wb") ) | |
pickle.dump(self.speciesToRemove,open(self.dumpsFolder+"/speciesToRemove","wb") ) | |
pickle.dump(self.familiesToFlag,open(self.dumpsFolder+"/familiesToFlag","wb") ) | |
pickle.dump(self.listOfSpecies,open(self.dumpsFolder+"/listOfSpecies","wb") ) | |
pickle.dump(self.geneIDMappingAllEntries,open(self.dumpsFolder+"/geneIDMappingAllEntries","wb") ) | |
pickle.dump(self.geneFilterList,open(self.dumpsFolder+"/geneFilterList","wb") ) | |
pickle.dump(self.nodeInfo,open(self.dumpsFolder+"/nodeInfo","wb") ) | |
pickle.dump(self.nodeInfoSpp,open(self.dumpsFolder+"/nodeInfoSpp","wb") ) | |
pickle.dump(self.proteinLengths,open(self.dumpsFolder+"/proteinLengths","wb") ) | |
pickle.dump(self.familiesDictLengths,open(self.dumpsFolder+"/familiesDictLengths","wb") ) | |
# for key in vars(self): | |
# pickle.dump(eval("self."+key),open("dumps/"+str(key),"wb") ) | |
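# A generic alternative to the eval-based loop above (a sketch only, not used here): iterate over
# the instance attributes with vars() and skip open file handles, which are not picklable
# (e.g. self.comparaFamiliesFile). Whether every remaining attribute should be persisted is an
# assumption, so the explicit per-attribute dumps above remain the method of record.
# for key, value in vars(self).items():
#     if hasattr(value, "read"): continue #skip open file handles; pickle cannot serialize them
#     pickle.dump(value, open(self.dumpsFolder+"/"+str(key), "wb"))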
def loadAll(self,dumpPath): | |
if self.verbose: print ("Loading pickle dump files..") | |
#loading only the required data | |
self.familiesDict=pickle.load(open(dumpPath+"/familiesDict","rb") ) | |
self.familiesTaxon=pickle.load(open(dumpPath+"/familiesTaxon","rb") ) | |
self.numTaxonBelow=pickle.load(open(dumpPath+"/numTaxonBelow","rb") ) | |
self.familiesGeneDuplication=pickle.load(open(dumpPath+"/familiesGeneDuplication","rb") ) | |
self.familiesGeneDuplicationMaxNode=pickle.load(open(dumpPath+"/familiesGeneDuplicationMaxNode","rb") ) | |
self.familiesGeneLosses=pickle.load(open(dumpPath+"/familiesGeneLosses","rb") ) | |
self.interproscanData=pickle.load(open(dumpPath+"/interproscanData","rb") ) #comment this line out to run faster during development; keep it when running for real
self.branchLengths=pickle.load(open(dumpPath+"/branchLengths","rb") ) | |
self.cladePerSpecies=pickle.load(open(dumpPath+"/cladePerSpecies","rb") ) | |
self.speciesPerClade=pickle.load(open(dumpPath+"/speciesPerClade","rb") ) | |
self.familiesToFlag=pickle.load(open(dumpPath+"/familiesToFlag","rb") ) | |
self.listOfSpecies=pickle.load(open(dumpPath+"/listOfSpecies","rb") ) | |
self.nodeInfo=pickle.load(open(dumpPath+"/nodeInfo","rb") ) | |
self.nodeInfoSpp=pickle.load(open(dumpPath+"/nodeInfoSpp","rb") ) | |
self.familiesDictLengths=pickle.load(open(dumpPath+"/familiesDictLengths","rb") ) | |
def run(self): | |
"""Runs the functions in order.""" | |
self.readGeneIDMappingFolder() #this has to be run before the readGeneFilterList() | |
if self.removeOutgroupsFile != "": self.readOutgroupList() #this has to be run before readFamilies() | |
if self.geneFilterListFile != "": self.readGeneFilterList() #this has to be run before readFamilies() | |
self.readProteinLengthFiles() #this has to be run before readFamilies | |
self.readFamilies() | |
self.readSpeciesTree() | |
if self.removeOutgroupsFile != "": self.writeFamiliesPerTaxonomyLevel() #this has to be run after readFamilies() | |
if self.flagFamilyListFile != "": self.readFlagFamilyList() #this can be run anywhere before statsPerFamily() | |
self.readFamiliesPerTaxonomyLevel() #note that this will read a file created in this script | |
self.readGeneDuplicationsFolder() | |
self.readGeneLossesFolder() | |
self.readSpeciesNameTag() | |
self.readInterproGFFFolder() | |
self.readBranchLengths() | |
self.readCladeFile() | |
self.readNodeInfoFile() #has to be run after readCladeFile | |
if self.dumpsFolder != "": self.dumpAll() | |
self.statsPerFamily() | |
if self.dumpFamiliesFile != "": self.writeFamilies() | |
self.cleanUp() | |
def runLoad(self,dumpPath): | |
"""For script development. To use if just changing the statsPerFamily function.""" | |
if self.verbose: print ("Loading dumps..") | |
self.loadAll(dumpPath) #this loads are required objects to runs stats on them | |
self.statsPerFamily() | |
if self.dumpFamiliesFile != "": self.writeFamilies() | |
def main(): | |
import argparse | |
parser = argparse.ArgumentParser(description='Analysis suite for exploring Compara families.') | |
#positional args | |
parser.add_argument('comparaFamilies', metavar='comparaFamilies', type=str, help='Flat file. Compara families file. One family per line, list of genes/species. Output from get_all_50HG_families_v75.pl.') | |
parser.add_argument('familiesPerTaxonomyLevel', metavar='familiesPerTaxonomyLevel', type=str, help='Flat file. One taxonomy level per line, list of families. Output from get_families_per_taxonomy_level_v75.pl.') | |
parser.add_argument('speciesTree', metavar='speciesTree', type=str, help='Species tree in newick format. Each node should have a unique name; this expects species/node+ID (e.g. dorylaimia40001065, romanomermis_culicivorax40001064). Output from get_species_tree_v75.pl.')
parser.add_argument('geneDuplicationsFolder', metavar='geneDuplicationsFolder', type=str, help='Folder with a file for each taxon/node. File names should be e.g. ciona_intestinalis40001180.txt. Output from get_gene_duplications_per_taxonomy_level_v75.pl.') | |
parser.add_argument('geneLossesFolder', metavar='geneLossesFolder', type=str, help='Folder with a file for each taxon/node. File names should be e.g. ciona_intestinalis40001180.txt. Output from get_gene_losses_per_taxonomy_level_v75.pl.') | |
parser.add_argument('speciesNameTag', metavar='speciesNameTag', type=str, help='TSV file. Locus tag - Species name correspondence. PREFIX Genus species VERSION.') | |
parser.add_argument('geneIDMappingFolder', metavar='geneIDMappingFolder', type=str, help='Folder with a *_id_mapping.txt TSV file for each species, with Gene IDs from different sources. SPECIES_NAME GENEMEMBER_ID SEQMEMBER_ID TRANSCRIPT_ID PROTEIN_ID IPRSCAN_GFF_ID.') | |
parser.add_argument('interproscanGFFFolder', metavar='interproscanGFFFolder', type=str, help='Folder with a *ipr* GFF (InterProScan) file for each species. Files can be gzipped.')
parser.add_argument('branchLengths', metavar='branchLengths', type=str, help='TSV file. One line per Compara family: family ID in the first column, average branch length of the tree in the second column, median in the third column. Output from get_compara_branch_length.py.')
parser.add_argument('speciesClade', metavar='speciesClade', type=str, help='TSV file. One line per species, first column species name as in comparaFamilies, second column the clade name/number, third column either Parasitic or Freeliving. E.g. loa_loa\tIII\tParasitic') | |
parser.add_argument('nodeInfo', metavar='nodeInfo', type=str, help='File with leaves per each node in the tree. Works even if whole species tree was used (no need to prune). Produced by ~alc/Documents/git/Python/parse_tree_with_ETE.py') | |
parser.add_argument('proteinLengthsFolder', metavar='proteinLengthsFolder', type=str, help='Path to a folder with one file per species. Files need to have the same names as the species and contain a list of all proteins and their respective lengths, in amino acids. Also used to recreate singleton lists. From get_protein_lengths_all_species_v75.pl.')
#optional argument | |
parser.add_argument('--outputFile', metavar='outputFile', default = os.getcwd()+"/comparaFamiliesParser.out", type=str, help='File where important output will be written (but you should also store stdout and stderr). If none is given, a file called comparaFamiliesParser.out will be created in the current working directory.')
parser.add_argument('--verbose', metavar='verbose', default = 1, type=int, help='Verbose mode (1 = Yes, 0 = No). Default = 1.') | |
parser.add_argument('--removeOutgroups', metavar='removeOutgroups', default = "", type=str, help='Whether to remove outgroup species from the results. Give a file with the list of species/tree nodes to remove/ignore from the analysis, one per line. First column is the node ID (e.g. 400010010), second column is the node/species name, third column is whether this is "species" or "node". Note that this must also include internal nodes. Default = "" (off)')
parser.add_argument('--dumpFamilies', metavar='dumpFamilies', default = "", type=str, help='Option to create an output file with the post-filtered Compara families used here. Give the desired output file name after the argument. Default = "" (off)')
parser.add_argument('--flagFamilyList', metavar='flagFamilyList', default = "", type=str, help='Option to flag specific families with "exclude". Provide a flat file with one family ID per line. Default = "" (off)')
parser.add_argument('--geneFilterList', metavar='geneFilterList', default = "", type=str, help='Option to exclude the listed genes from all output measures. Use mRNA IDs (SEQMEMBER_ID), which may differ from the IDs in the families file. Default = "" (off)')
parser.add_argument('--loadDumps', metavar='loadDumps', default = "", type=str, help='Instead of running the script from scratch, load previously dumped Python objects. Provide the dumps folder path. Default = "" (off)')
parser.add_argument('--dumpsFolder', metavar='dumpsFolder', default = "dumps", type=str, help='Give folder path to create object dumps. Default = create dumps in folder called "dumps".') | |
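#Example invocation of the positional and a few optional arguments, in the order defined above.
#This is a sketch only: the script name and every file/folder name are hypothetical placeholders,
#not files shipped with this script.
#  python3 comparaFamiliesParser.py families.txt familiesPerTaxonomyLevel.txt species_tree.nwk \
#          gene_duplications/ gene_losses/ species_name_tag.tsv id_mapping/ iprscan_gff/ \
#          branch_lengths.tsv species_clades.tsv node_info.txt protein_lengths/ \
#          --removeOutgroups outgroups_to_remove.txt --dumpFamilies filtered_families.txt --dumpsFolder dumps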
args = parser.parse_args() #gets the arguments | |
print (args) | |
#TODO: is there a better way to pass these arguments, e.g. with *args or **kwargs? Could not get it to work using eval.
run = Run(args.comparaFamilies,args.familiesPerTaxonomyLevel,args.geneDuplicationsFolder,args.geneLossesFolder,args.speciesNameTag,args.geneIDMappingFolder,\ | |
args.interproscanGFFFolder,args.branchLengths,args.removeOutgroups,args.speciesTree,args.verbose,args.outputFile,args.dumpFamilies,args.flagFamilyList,\
args.speciesClade,args.geneFilterList,args.dumpsFolder,args.nodeInfo,args.proteinLengthsFolder) | |
if args.loadDumps != "": | |
#to speed up processing, just load data and run stats | |
run.runLoad(args.loadDumps) | |
else: | |
#create dump folder, write parameter file and run all | |
if not os.path.exists(args.dumpsFolder): | |
os.mkdir(args.dumpsFolder) | |
with open(args.dumpsFolder+"/parameters.txt","w") as parametersFile: | |
parametersFile.write(str(args) ) | |
run.run() | |
print ("Finished in %.1f seconds." % (clock()-time0)) | |
if __name__ == '__main__': | |
main() | |