Script written by Diogo Ribeiro to identify gene family expansions in an in-house Compara database
#!/usr/bin/env python3 | |
#25-Feb-2015 dr7 | |
#Analysis suite for exploring Compara families. The purpose is to gather information about Compara trees so that we can mine this large set of information and select the most interesting families to study.
import sys | |
import os | |
import re | |
import gzip | |
import random | |
import pickle | |
from numpy import mean, median, std #numpy.std gets the population standard deviation, not the sample standard deviation | |
from glob import glob | |
from pprint import pprint | |
from time import perf_counter as clock #time.clock was removed in Python 3.8; perf_counter is the closest drop-in replacement
from warnings import warn | |
#dependency | |
import findLastCommonAncestor #dr7 deployed and tested code | |
from Bio import Phylo | |
time0 = clock() | |
class Run(object): | |
def __init__(self, comparaFamilies,familiesPerTaxonomyLevel,geneDuplicationsFolder,geneLossesFolder,speciesNameTag,geneIDMappingFolder, \ | |
interproscanGFFFolder,branchLengths,removeOutgroupsFile,speciesTreeFile,verbose,outputFile,dumpFamilies,flagFamilyList,speciesClade, \ | |
geneFilterList,dumpsFolder,nodeInfo,proteinLengthsFolder): | |
"""Initialization of all class variables. Explains data structures.""" | |
#inputs / arguments | |
self.comparaFamiliesFile = open(comparaFamilies,"r") | |
self.familiesPerTaxonomyLevelFile = familiesPerTaxonomyLevel | |
self.speciesTreeFile = speciesTreeFile | |
self.geneDuplicationsFolder = geneDuplicationsFolder | |
self.geneLossesFolder = geneLossesFolder | |
self.speciesNameTagFile = open(speciesNameTag,"r") | |
self.geneIDMappingFolder = geneIDMappingFolder | |
self.interproscanGFFFolder = interproscanGFFFolder | |
self.branchLengthsFile = open(branchLengths,"r") | |
self.cladeFile = open(speciesClade,"r") | |
self.nodeInfoFile = open(nodeInfo,"r") | |
#optional inputs / arguments | |
self.verbose = verbose | |
self.outputFile = open(outputFile,"w") | |
self.removeOutgroupsFile = removeOutgroupsFile | |
self.dumpFamiliesFile = dumpFamilies | |
self.flagFamilyListFile = flagFamilyList | |
self.geneFilterListFile = geneFilterList | |
self.dumpsFolder = dumpsFolder | |
self.proteinLengthsFolder = proteinLengthsFolder | |
#hard-coded 'arguments' | |
self.jobID = str(random.randint(1,100000)) | |
self.targetSources = ["Pfam","TMHMM","SignalP_EUK"] #the types of GFF source entries that we are interested in storing in memory. #this is kept here as it may be modified, but not by user. | |
self.freeLivingSpecies = ["caenorhabditis_elegans","panagrellus_redivivus","rhabditophanes_kr3021","pristionchus_pacificus","schmidtea_mediterranea"] # 50HG Species that are completely free-living, so that I can distinguish parasitic-specific gene families. #This is just for helminth species, other non-helminth outgroups would be free-living too. | |
#data structures #main connector is familyID or species | |
self.familiesDict = {} #key -> familyID, val -> dict: key -> species, val -> list of genes | |
self.familiesTaxon = {} #key -> familyID, val -> taxonomy level | |
self.numTaxonBelow = {} #key -> taxon ID, val -> number species below | |
self.familiesGeneDuplication = {} #key -> familyID, val -> sum of duplications across all species-tree nodes
self.familiesGeneDuplicationMaxNode = {} #key -> familyID, val -> node ID with most duplication events | |
self.familiesGeneLosses = {} #key -> familyID, val -> sum of losses across all species-tree nodes
self.tagSpeciesName = {} #key -> species_tag, val -> species_name #this is to connect species to interproscan files | |
self.geneIDMapping = {} #key -> species, val -> dict: key -> geneID(interpro), val -> list of other geneIDs #this is to connect genes in interproscan to genes in families.txt | |
self.geneIDMappingAllEntries = {} #key -> mRNA ID, val -> list of other IDs | |
self.interproscanData = {} #key -> species, value -> dict; key -> gene, value -> dict; key-> source, val-> all lines attributed to that gene/source | |
self.branchLengths = {} #key -> familyID, val -> average branch length in family | |
self.cladePerSpecies = {} #key -> species, val -> clade | |
self.speciesPerClade = {} #key -> clade, value -> list of species in the clade (based on the species tree)
self.nodeInfo = {} #key -> node ID, list of clades potentially present under that node. e.g. 'bilateria40001012': {'I', 'IV', 'Flatworm', 'Flukes', 'Tapeworm', 'V', 'III'} | |
self.nodeInfoSpp = {} #key -> node ID, list of species present under that node. | |
self.proteinLengths = {} #key -> species, val -> dict ; key -> gene, val -> protein length in aa | |
self.familiesDictLengths = {} #key -> familyID, val -> dict: key -> species, val -> sum of protein length of proteins in that family and species. | |
#optional data structures | |
self.speciesTree = {} #key -> parent, value -> child. Done from a newick tree with external script "findLastCommonAncestor" | |
self.removeOutgroups = [] #list of node/leaf ids + species/node-name to remove from analysis | |
self.speciesToRemove = [] #list of species to remove from analysis #to apply when processing families.txt | |
self.familiesToFlag = [] #list of family IDs of families to have 'exclude' flag | |
self.geneFilterList = [] #list of gene IDs to filter out | |
self.listOfSpecies = [] #list of species we process here #based on the species tree! | |
##Notes: | |
#The species name should be lower case and with underscore e.g. panagrellus_redivivus | |
#The family IDs should be an int | |
#geneFilterListFile: added gene filtering into the readFamily step. Most measures will be (re)calculated in this script itself (e.g. gene tree root is recalculated). Note though that the following measures won't be correct/updated when gene filtering is applied: Total_duplications, Total_losses, Max_duplications_node, Median_branch_length. These are external measures, calculated only before the gene filtering.
def readFamilies(self): | |
"""Parse compara families file. This file can contain all species and genes (i.e. can be Compara families raw file). | |
Also does the species filtering (i.e. only counts genes in species that are not listed to remove). | |
Also does gene filtering, if given a list of genes to discard. | |
Only retains families that have at least 2 genes after filtering. | |
Also tracks protein lengths in addition to gene counts: instead of only counting genes, it also sums the protein lengths of a species' genes in the family (e.g. a species, instead of having 10 genes in that family, has 3000 aa).
E.g. input file line: family 3 : SSTP_0000649300.1 (strongyloides_stercoralis) DME_0000314201-mRNA-1 (dracunculus_medinensis) TELCIR_18205 (teladorsagia_circumcincta) HPBE_0001583601-mRNA-1 (heligmosomoides_bakeri) """ | |
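#Illustrative sketch (comments only) of how the example line above is parsed by the loop below,
#assuming the "family <ID> : gene1 (species1) gene2 (species2) ..." layout shown in the docstring:
# line.split(":",1) -> ["family 3 ", " SSTP_0000649300.1 (strongyloides_stercoralis) ..."]
# spl[0].split(" ")[1] -> "3" (the familyID)
# spl[1].split(" ")[1:] -> alternating gene / "(species)" tokens, consumed two at a time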
if self.verbose: print ("Reading compara families files..") | |
comparaFamiliesFile = self.comparaFamiliesFile | |
speciesToRemove = self.speciesToRemove | |
geneFilterList = self.geneFilterList | |
proteinLengths = self.proteinLengths | |
familiesDict = {} | |
familiesDictLengths = {} | |
for line in comparaFamiliesFile: #for each family | |
if "----------" not in line: | |
line = line.strip() | |
spl = line.split(":",1) #doing only max of one split, this is because some IDs also contain ":" | |
assert (len(spl) == 2) | |
familyID = int(spl[0].split(" ")[1]) | |
speciesDict = {} #key -> sp, val -> list of genes | |
speciesDictLengths = {} #same as previous but stores cumulative lengths of genes of a species as an integer | |
genes = spl[1].split(" ")[1:] | |
for i in range(0,len(genes),2): | |
gene,sp = genes[i],genes[i+1] | |
if gene not in geneFilterList: #otherwise it is ignored, as if not existing. After this there should be a condition excluding potentially empty families. | |
sp = sp.replace("(","").replace(")","") | |
if sp not in speciesToRemove: #validated that this works well by looking at a couple of families | |
if sp not in speciesDict: | |
speciesDict[sp] = [] | |
speciesDictLengths[sp] = 0 | |
speciesDict[sp].append(gene) | |
#for protein lengths instead of gene numbers | |
proteinLength = int(proteinLengths[sp][gene] ) | |
speciesDictLengths[sp]+= proteinLength | |
lens = [len(speciesDict[sp]) for sp in speciesDict] #numbers of genes | |
sumLens = sum(lens) | |
if len(speciesDict) > 0 and sumLens > 1: #and sum(list(speciesDict.values()) ) > 1: #there can be empty families, if all species of that tree are to be removed #also there could be single gene 'families' after filtering. | |
familiesDict[familyID] = speciesDict | |
familiesDictLengths[familyID] = speciesDictLengths | |
if self.verbose: print ("Total number compara families (from families.txt) to be processed, after filtering:\t%s" % len(familiesDict)) #this number does not have to match the total number of families, in case species are being removed | |
assert (len(familiesDict) == len(familiesDictLengths)) | |
self.familiesDict = familiesDict | |
self.familiesDictLengths = familiesDictLengths | |
def readFamiliesPerTaxonomyLevel(self): | |
"""Reads families per taxonomy level. Note that, if the families.txt is filterd in this script, the input for this function may be a temporary file (corrected for new families) created in this script itself. | |
E.g. Hymenolepis microstoma 40001043 : 841230,912843,1018527,1058154,1059819,1081886,1082962,1090840,1117204,1117804,1118812""" | |
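#Worked example (a sketch based on the docstring line above) of the taxon key built below:
# "Hymenolepis microstoma 40001043 : 841230,..." -> taxon key "hymenolepis_microstoma40001043",
# the name+ID format later used as the family's branchName (e.g. looked up in numTaxonBelow).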
if self.verbose: print ("Reading %s file.." % self.familiesPerTaxonomyLevelFile) | |
familiesPerTaxonomyLevelFile = open(self.familiesPerTaxonomyLevelFile,"r") | |
familiesTaxon = {} | |
for line in familiesPerTaxonomyLevelFile: | |
line = line.strip() | |
taxon,families = line.split(":") | |
taxon = taxon.lower().strip().split(" ") #may need lower() | |
taxonNum = taxon[-1] | |
taxon = "_".join(taxon[:-1])+taxonNum #e.g. Nippostrongylus_brasiliensis40001108 | |
families = families.strip().split(",") | |
for family in families: | |
family = int(family) | |
if family not in familiesTaxon: | |
familiesTaxon[family] = taxon | |
else: | |
print ("Two roots for family?",family) | |
if self.verbose: print ("Total families in families_per_taxonomy_level:\t%s" % len(familiesTaxon)) | |
self.familiesTaxon = familiesTaxon | |
def _readTaxonFiles(self,inFolder): | |
"""E.g filename: echinostomatoidea40001020.txt. Inside file: 66838 2 (familyID\tNum_Events). | |
What I want is to combine all events for each family.""" | |
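#Sketch of the aggregation below, using the node files quoted in the validation comments further down:
#if family 126577 appears as "126577 4" in ascaridoidea40001165.txt and "126577 12" in bilateria40001006.txt,
#then outDict[126577] accumulates 4+12 and outDictNode[126577] keeps {"ascaridoidea40001165": 4,
#"bilateria40001006": 12}, so the node with the most events can be picked later.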
removeOutgroups = self.removeOutgroups | |
outDict = {} #key -> family ID, val -> tot events | |
outDictNode = {} #key -> family ID, val -> dict; key -> nodeID, val -> tot events #to have node with most events | |
filesToProcess = glob(inFolder+"/*.txt") | |
for fi in filesToProcess: | |
fileName = fi.split("/")[-1].replace(".txt","") | |
if fileName in removeOutgroups: #exclude data from nodes/species that we want to exclude | |
continue | |
# #checking if taxon IDs match between files | |
# print (self.numTaxonBelow[fileName]) | |
f = open(fi,"r") | |
for line in f: | |
if "#" in line: | |
continue | |
line = line.strip() | |
spl = line.split("\t") | |
familyID = int(spl[0]) | |
events = int(spl[1]) | |
if familyID not in outDict: | |
outDict[familyID] = 0 | |
outDictNode[familyID] = {} | |
if fileName not in outDictNode[familyID]: | |
outDictNode[familyID][fileName] = 0 | |
outDict[familyID]+=events | |
outDictNode[familyID][fileName]+=events | |
f.close() | |
return outDict,outDictNode | |
def readGeneDuplicationsFolder(self): | |
"""Process all files in Gene Duplications folder""" | |
if self.verbose: print ("Reading gene duplication files..") | |
geneDuplicationsFolder = self.geneDuplicationsFolder | |
run = self._readTaxonFiles(geneDuplicationsFolder) | |
familiesGeneDuplication = run[0] | |
nodeDict = run[1] | |
#getting most frequent node | |
familiesGeneDuplicationMaxNode = {} | |
for familyID in nodeDict: | |
maxVal = 0 | |
maxNode = "" | |
if familyID not in familiesGeneDuplicationMaxNode: | |
familiesGeneDuplicationMaxNode[familyID] = "" | |
for node in nodeDict[familyID]: | |
val = nodeDict[familyID][node] | |
if val > maxVal: | |
maxVal = val | |
maxNode = node | |
familiesGeneDuplicationMaxNode[familyID] = str(maxVal)+"X:"+maxNode | |
##validation | |
# dr7@farm3-head2:/nfs/helminths02/analysis/50HGP/00ANALYSES/final_gene_duplications/cutoffm1$ g 1178810 * | |
# crassostrea_gigas40001175.txt:1178810 1 | |
# schmidtea_mediterranea40001061.txt:1178810 2 | |
# Family 126577: '37X:schmidtea_mediterranea40001061' | |
# dr7@farm3-head2:/nfs/helminths02/analysis/50HGP/00ANALYSES/final_gene_duplications/cutoffm1$ grep 126577 * | |
# ... | |
# ascaridoidea40001165.txt:126577 4 | |
# bilateria40001006.txt:126577 12 | |
# ... | |
# schmidtea_mediterranea40001061.txt:126577 37 | |
# ... | |
if self.verbose: print ("Total families with gene duplication info:\t%s" % len(familiesGeneDuplication)) | |
self.familiesGeneDuplication = familiesGeneDuplication | |
self.familiesGeneDuplicationMaxNode = familiesGeneDuplicationMaxNode | |
def readGeneLossesFolder(self): | |
"""Process all files in Gene Losses folder""" | |
if self.verbose: print ("Reading gene loss files..") | |
geneLossesFolder = self.geneLossesFolder | |
familiesGeneLosses = self._readTaxonFiles(geneLossesFolder)[0] | |
#Validation: 1135828: 6 | |
# dr7@pcs5b:/nfs/helminths02/analysis/50HGP/00ANALYSES/final_gene_losses$ g 1135828 * | |
# chromadorea40001076.txt:1135828 1 | |
# platyhelminthes40001014.txt:1135828 1 | |
# romanomermis_culicivorax40001064.txt:1135828 1 | |
# soboliphyme_baturini40001066.txt:1135828 1 | |
# trichuris_muris40001075.txt:1135828 1 | |
# trichuris_trichiura40001074.txt:1135828 1 | |
if self.verbose: print ("Total families with gene losses info:\t%s" % len(familiesGeneLosses)) | |
self.familiesGeneLosses = familiesGeneLosses | |
def readSpeciesNameTag(self): | |
"""Reads tag-species_name correspondence. E.g. PREFIX Genus species VERSION | |
ACAC Angiostrongylus cantonensis 1.5.4""" | |
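#Example of the mapping built below (a sketch based on the docstring line above, tab-separated columns):
# "ACAC<TAB>Angiostrongylus<TAB>cantonensis<TAB>1.5.4" -> tagSpeciesName["ACAC"] = "angiostrongylus_cantonensis"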
if self.verbose: print ("Reading species name mapping files..") | |
speciesNameTagFile = self.speciesNameTagFile | |
tagSpeciesName = {} | |
for line in speciesNameTagFile: | |
if line.startswith('"This') or line.startswith("PREFIX") or "#" in line: | |
continue | |
line = line.strip() | |
spl = line.split("\t") | |
tag = spl[0] | |
sp = spl[1].lower().strip()+"_"+spl[2].strip() #strip because sometimes there are empty spaces
tagSpeciesName[tag] = sp | |
if self.verbose: print ("Total species with locus tag info:\t%s" % len(tagSpeciesName)) | |
self.tagSpeciesName = tagSpeciesName | |
def readGeneIDMappingFolder(self): | |
"""Reads Gene ID mapping files | |
E.g. acanthocheilonema_viteae nAv.1.0.1.g00113 nAv.1.0.1.t00113-RA nAv.1.0.1.t00113-RA nAv.1.0.1.t00113-RA nAvx1x0x1xt00113-RA""" | |
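#Column layout assumed by the parsing below, based on the example line in the docstring:
# spl[0]=species name, spl[1]=gene ID, spl[2]=mRNA ID, spl[5]=interproscan ID, so that
# geneIDMappingAllEntries[mRNA ID] -> spl[1:] and, when the interproscan ID is not "NA",
# geneIDMapping[species][interproscan ID] -> spl[1:-1] (species and interproscan columns dropped).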
if self.verbose: print ("Reading id mapping files..") | |
geneIDMappingFolder = self.geneIDMappingFolder | |
geneIDMapping = {} #key -> species, val -> dict: key -> geneID(interpro), val -> list of other geneIDs #this is to connect genes in interproscan to genes in families.txt | |
geneIDMappingAllEntries = {} #key -> mRNA ID, val -> list of other IDs | |
#Note: not all genes are present in the interproscan files, so the total number of genes for a species here is not representative of the full dataset. | |
filesToProcess = glob(geneIDMappingFolder+"/*_id_mapping.txt") | |
for fi in filesToProcess: | |
f = open(fi,"r") | |
fileName = fi.split("/")[-1].replace("_id_mapping.txt","") | |
if fileName not in geneIDMapping: | |
geneIDMapping[fileName] = {} | |
for line in f: | |
if line.startswith("SPECIES_NAME"): | |
continue | |
line = line.strip() | |
spl = line.split("\t") | |
iprID = spl[5] | |
mRNA = spl[2] | |
gene = spl[1] | |
geneIDMappingAllEntries[mRNA] = spl[1:] | |
if iprID != "NA": | |
if iprID not in geneIDMapping[fileName]: | |
geneIDMapping[fileName][iprID] = [] | |
geneIDMapping[fileName][iprID] = spl[1:-1] #first column is species name | |
f.close() | |
if self.verbose: print ("Total species with id mapping info:\t%s" % (len(geneIDMapping)) ) | |
if self.verbose: print ("Total genes with id mapping info:\t%s" % (len(geneIDMappingAllEntries)) ) | |
self.geneIDMapping = geneIDMapping | |
self.geneIDMappingAllEntries = geneIDMappingAllEntries | |
def readInterproGFFFolder(self): | |
"""Reads interproscan GFF files for all species. The files can be gzipped.""" | |
if self.verbose: print ("Reading interproscan files..") | |
interproscanGFFFolder = self.interproscanGFFFolder | |
tagSpeciesName = self.tagSpeciesName | |
geneIDMapping = self.geneIDMapping | |
interproscanData = {} #key -> species, value -> dict; key -> gene, value -> dict; key-> source, val-> all lines attributed to that gene/source | |
filesToProcess = glob(interproscanGFFFolder+"/*ipr*") | |
IDsNotFound = {} | |
for fi in filesToProcess: | |
if fi.endswith(".gz"): | |
# f = gzip.open(fi, 'rb').read().decode('ascii').split("\n") #this might be memory hungry | |
f = gzip.open(fi, 'rb').read().decode('utf-8').split("\n") #this might be memory hungry # bh4 changed this to utf-8 because ascii fails for the new ipr file for T.regenti | |
else: | |
f = open(fi,"r") | |
##e.g. file name /nfs/helminths02/analysis/50HGP/01INTERPRO/ACOC.protein.fa.gz.fas.ipr.gz | |
tag = fi.split("/")[-1].split(".")[0] | |
species = tagSpeciesName[tag] | |
if species not in interproscanData: | |
interproscanData[species] = {} | |
if species not in geneIDMapping: | |
warn("Warning: %s not in geneIDMapping" % species) | |
for line in f: | |
if "##FASTA" in line: | |
break #I do not want to read the fasta part of GFF. #I assume that after this line appears there will be no more GFF/Interproscan entries | |
if "##" in line: | |
continue | |
line = line.strip() | |
spl = line.split("\t") | |
if len(spl) < 8: #means it is not a proper GFF line
continue | |
source = spl[1] | |
if source not in self.targetSources: | |
continue | |
iprID = spl[0] | |
# if iprID not in geneIDMapping[species]: | |
# if species not in IDsNotFound: | |
# IDsNotFound[species] = set() | |
# IDsNotFound[species].add(iprID) | |
if iprID in geneIDMapping[species]: #It can happen that a gene is not present in geneIDMapping, this is because some interproscan files are not updated, containing genes that have since been removed for compara. | |
geneID = geneIDMapping[species][iprID][0] | |
# else: | |
# warn("IPR ID not in gene mapping file.") | |
if geneID not in interproscanData[species]: | |
interproscanData[species][geneID] = {} | |
if source not in interproscanData[species][geneID]: | |
interproscanData[species][geneID][source] = [] | |
interproscanData[species][geneID][source].append(line) | |
#pprint(interproscanData[species]) | |
#Validation, number of Pfam domains in file | |
#/nfs/helminths02/analysis/50HGP/01INTERPRO/ACOC.protein.fa.gz.fas.ipr.gz 13886 | |
# dr7@farm3-head2:/lustre/scratch108/parasites/dr7/50HG/50HG_100K_families_parsing/interproscanGFFs$ grep Pfam ACOC.protein.fa.gz.fas.ipr | wc -l | |
# 13886 | |
# print ("IDs on IPR GFF but not on ID Mapping file!",len(IDsNotFound)) | |
# for sp in IDsNotFound: | |
# print (sp,len(list(IDsNotFound[sp])) | |
# print (IDsNotFound[sp]) | |
if self.verbose: print ("Total species with interproscan data:\t%s" % len(interproscanData) ) | |
self.interproscanData = interproscanData | |
def readBranchLengths(self): | |
"""Reads branch lengths file. Grabs median. | |
E.g. 22454 0.125 0.081 0.204589,0.263049,0.078644,0.113182,0.282062,0.03627,0.105361,0.177002,0.089029,0.129357,0.076138,0.102818""" | |
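#Sketch of what is kept from the example line above: only the median column is stored, i.e.
# "22454<TAB>0.125<TAB>0.081<TAB>0.204589,..." -> branchLengths[22454] = 0.081
#(the mean and the per-branch list are read but not used here).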
if self.verbose: print ("Reading family branch lengths file..") | |
branchLengthsFile = self.branchLengthsFile | |
branchLengths = {} | |
for line in branchLengthsFile: | |
line = line.strip() | |
familyID,meanBranchLen,medianBranchLen,lista = line.split("\t") #renamed so the locals do not shadow numpy's mean/median imported above
familyID = int(familyID)
branchLengths[familyID] = float(medianBranchLen) #only the median is kept
if self.verbose: print ("Total families with branch lengths:\t%s" % len(branchLengths) ) | |
self.branchLengths = branchLengths | |
def readOutgroupList(self): | |
"""Reads file with outgroup / node ids to exclude from analysis. ID\tspecies/node-name\twhether_is_species_or_node | |
e.g. 40001001 amphimedon_queenslandica species""" | |
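#Example of the two lists built below, for the docstring line above:
# "40001001<TAB>amphimedon_queenslandica<TAB>species" ->
# removeOutgroups gets "amphimedon_queenslandica40001001" (name+ID, matching the per-node file names),
# speciesToRemove gets "amphimedon_queenslandica" (only because the third column is "species").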
if self.verbose: print ("Reading file with outgroups to remove..") | |
removeOutgroupsFile = open(self.removeOutgroupsFile,"r") | |
removeOutgroups = [] | |
speciesToRemove = [] | |
for line in removeOutgroupsFile: | |
line = line.strip() | |
spl = line.split("\t") | |
removeOutgroups.append(spl[1]+spl[0]) | |
if spl[2] == "species": | |
speciesToRemove.append(spl[1]) | |
if self.verbose: print ("Total outgroup nodes/leaves to be removed:\t%s" % len(removeOutgroups) ) | |
if self.verbose: print ("Total species (leaves) to be removed:\t%s" % len(speciesToRemove) ) | |
self.speciesToRemove = speciesToRemove | |
self.removeOutgroups = removeOutgroups | |
def readSpeciesTree(self): | |
"""Reads newick species tree. Node/leaf names must have at least one alphabethic character, otherwise if only numeric Phylo will read names as being branch lengths. Branch lengths are not required. | |
This outputs a tree with parent-child relationships. Will only work for simple binary trees. | |
Also returns number taxons below each node based on the tree.""" | |
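#Sketch of the structures built below (they rely on the external findLastCommonAncestor helpers):
# speciesTree : parent-child relationships as a dict (per the class-level comment: key -> parent, val -> child),
# numTaxonBelow : node/leaf name -> number of leaves under it (via findLastCommonAncestor.countLeaves),
# listOfSpecies : leaf names with the trailing "400..." numeric ID stripped, e.g. via leaf.split("400")[0].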
if self.verbose: print ("Reading species tree..") | |
speciesTreeFile = self.speciesTreeFile | |
numTaxonBelow = {} | |
sppTaxonBelow = {} | |
tree = Phylo.read(speciesTreeFile, 'newick') | |
speciesTree = findLastCommonAncestor.convertTreeToDict(tree) #in python dictionary format | |
reverseDict = findLastCommonAncestor.reverseDict(speciesTree) | |
allNodes = list(speciesTree.keys()) | |
allNodes.extend(list(speciesTree.values()) ) | |
allNodes = set(allNodes) | |
for node in allNodes: | |
numTaxonBelow[node] = int(findLastCommonAncestor.countLeaves(reverseDict,node)) | |
if self.verbose: print ("Total nodes/leaves in species tree:\t%s" % len(speciesTree) ) | |
if self.verbose: print ("Total nodes/leaves with number of taxon below:\t%s" % len(numTaxonBelow) ) | |
self.numTaxonBelow = numTaxonBelow | |
self.speciesTree = speciesTree | |
leaves = [leaf.name for leaf in tree.get_terminals()] | |
listOfSpecies = [] | |
for leaf in leaves: | |
listOfSpecies.append(leaf.split("400")[0]) | |
self.listOfSpecies = listOfSpecies | |
def writeFamiliesPerTaxonomyLevel(self): | |
"""If filtering out some species/nodes, I need to calculate new root for these altered compara families, and create an updated familiesPerTaxonomyLevel. The root (branchName) will then be taken from this file. | |
This has to be run after readFamilies. The external 'findLastCommonAncestor' functions have been tested previously. | |
This function has been properly validated: the created file is the same as the one from Compara, if not removing any outgroups.""" | |
#e.g. | |
#Hymenolepis microstoma 40001043 : 841230,912843,1018527,1058154,1059819 | |
#Strongyloides 40001088 : 269778,518849,529306,576845,647355,668201,705124,735608 | |
if self.verbose: print ("Creating familiesPerTaxonomyLevel.txt with recalculated roots..") | |
familiesDict = self.familiesDict | |
speciesTree = self.speciesTree | |
tmpFile = open("tmp_familiesPerTaxonomyLevel_"+self.jobID+".txt","w") | |
speciesKeys = list(speciesTree.keys()) | |
familiesTaxonLevel = {} #this is temporary structure to store what will be written in tmp file | |
#loop through each family, calculate new root and write in file. | |
for familyID in familiesDict: | |
species = list(familiesDict[familyID].keys()) #these are e.g. clonorchis_sinensis, but we need to match to speciesTree clonorchis_sinensis40001019 | |
speciesKey = "" | |
renamedSpecies = [] | |
for sp in species: | |
for key in speciesKeys: | |
if sp in key: | |
renamedSpecies.append(key) | |
assert (len(species) == len(renamedSpecies)) #otherwise species-tree species names do not match families.txt species names.. | |
rootNode = findLastCommonAncestor.run(renamedSpecies,speciesTree) | |
#rename rootNode so that it resembles the same as in familiesPerTaxonomyLevel. | |
#This bit may be particular to 50HG Compara. But it should not matter for other databases/ID types; the file will simply look different.
if "4000" in rootNode: | |
spl = rootNode.partition("4000") | |
name = spl[0] | |
ID = "".join(spl[1:]) | |
if "_" in name: | |
name = name.replace("_"," ") | |
name = name[0].upper()+name[1:] | |
rootNode = name+" "+ID | |
if rootNode not in familiesTaxonLevel: | |
familiesTaxonLevel[rootNode] = [] | |
familiesTaxonLevel[rootNode].append(str(familyID) ) | |
#write to new file | |
for taxon in familiesTaxonLevel: | |
tmpFile.write(taxon+" : "+",".join(familiesTaxonLevel[taxon])+"\n") | |
tmpFile.close() | |
if self.verbose: print ("Total number of node/species in new familiesPerTaxonomyLevel file:\t%s" % len(familiesTaxonLevel)) #this should be equal to: total nodes+species in species tree (180 for 50HG) minus the removed ones. | |
self.familiesPerTaxonomyLevelFile = tmpFile.name #change familiesPerTaxonomyLevelFile file pointer to newly created file | |
def readFlagFamilyList(self): | |
"""Read file with list of families to be flagged.""" | |
if self.verbose: print ("Reading file with families to flag with 'exclude'..") | |
flagFamilyListFile = open(self.flagFamilyListFile,"r") | |
familiesToFlag = [int(familyID.strip()) for familyID in flagFamilyListFile] | |
self.familiesToFlag = familiesToFlag | |
if self.verbose: print ("Total number of families to be flagged: %s" % (len(familiesToFlag)) ) | |
def readGeneFilterList(self): | |
"""Read file with list of genes to be excluded from measures.""" | |
if self.verbose: print ("Reading file with genes to exclude..") | |
geneIDMappingAllEntries = self.geneIDMappingAllEntries #mRNA ID to other IDs | |
geneFilterListFile = open(self.geneFilterListFile,"r") | |
#convert mRNA ID to gene ID, to match families | |
geneFilterList = {geneIDMappingAllEntries[geneID.strip()][0] for geneID in geneFilterListFile} | |
self.geneFilterList = geneFilterList | |
if self.verbose: print ("Total number of genes to be excluded: %s" % (len(geneFilterList)) ) | |
def readCladeFile(self): | |
"""Read clade file, there must be a clade per each species""" | |
if self.verbose: print ("Reading clade file..") | |
cladeFile = self.cladeFile | |
listOfSpecies = self.listOfSpecies | |
cladePerSpecies = {} #key -> species, val -> clade | |
speciesPerClade = {} #key -> clade, value -> species in the clade (based on the species tree) | |
for line in cladeFile: | |
line = line.strip() | |
spl = line.split("\t") | |
sp = spl[0] | |
clade = spl[1] | |
sp = sp.lower() | |
sp = sp.replace(" ","_") | |
if sp in listOfSpecies: | |
cladePerSpecies[sp] = clade | |
if clade not in speciesPerClade: | |
speciesPerClade[clade] = [] | |
speciesPerClade[clade].append(sp) | |
self.cladePerSpecies = cladePerSpecies | |
self.speciesPerClade = speciesPerClade | |
if self.verbose: print ("Total number of species in clade file: %s" % (len(cladePerSpecies)) ) | |
def readNodeInfoFile(self): | |
"""Reads file with information on leaves for each node. E.g. | |
node amphimedon_queenslandica40001001 has parent metazoa40001000 | |
node euteleostomi40001177 has descendants danio_rerio40001178,homo_sapiens40001179 | |
Retrieves the set of Clades present for each node. All species of that clade need to be present for a clade to be considered (e.g. paraphyletic).""" | |
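#Sketch of the clade check below: a clade is added to nodeInfo[nodeID] only if every species of that clade
#(as listed in speciesPerClade) appears among the node's descendants; e.g. if a clade contains 4 species,
#all 4 must be under the node for the clade to be counted (the count of 4 is illustrative).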
if self.verbose: print ("Reading node info file..") | |
nodeInfoFile = self.nodeInfoFile | |
speciesPerClade = self.speciesPerClade | |
cladePerSpecies = self.cladePerSpecies | |
nodeInfo = {} #key -> node ID, list of clades potentially present under that node. | |
nodeInfoSpp = {} #key -> node ID, list of species present under that node. | |
for line in nodeInfoFile: | |
line = line.strip() | |
if "descendants" in line: | |
spl = line.split(" ") | |
nodeID = spl[1] | |
descendants = spl[4].split(",") | |
cladeList = [] | |
for des in descendants: | |
des = des.split("400")[0] | |
cladeList.append(cladePerSpecies[des]) | |
descendantClades = set() | |
for clade in speciesPerClade: | |
if cladeList.count(clade) == len(speciesPerClade[clade]): #to make sure the whole clade is complete. I should only count a descendant clade if all species are present | |
descendantClades.add(clade) | |
nodeInfo[nodeID] = sorted(list(descendantClades)) | |
nodeInfoSpp[nodeID] = [des.split("400")[0] for des in descendants] | |
self.nodeInfo = nodeInfo | |
self.nodeInfoSpp = nodeInfoSpp | |
if self.verbose: print ("Total number of tree nodes with leaf info: %s" % (len(nodeInfo)) ) | |
def readProteinLengthFiles(self): | |
"""Read list of all genes and their length""" | |
if self.verbose: print ("Reading protein lengths folder..") | |
proteinLengthsFolder = self.proteinLengthsFolder | |
proteinLengths = {} #key -> species, val -> dict. Key -> gene id, val -> length | |
filesToProcess = glob(proteinLengthsFolder+"/*_protein_lengths.txt") | |
count = 0 | |
for f in filesToProcess: | |
inFile = open(f,"r") | |
spName = f.split("/")[-1].replace("_protein_lengths.txt","") | |
if spName not in proteinLengths: | |
proteinLengths[spName] = {} | |
for line in inFile: | |
gene,geneLength = line.strip().split("\t") | |
proteinLengths[spName][gene] = geneLength | |
count+=1 | |
inFile.close() | |
self.proteinLengths = proteinLengths | |
if self.verbose: print ("Total number of species with protein lengths: %s and total genes: %s" % (len(proteinLengths),count) ) | |
def statsPerFamily(self): | |
"""Main function to write stats for each family""" | |
if self.verbose: print ("Making stats..") | |
#1) all global data structures in use #just wrote this for easier localization and to make sure I don't change the values
familiesDict = self.familiesDict | |
familiesTaxon = self.familiesTaxon | |
numTaxonBelow = self.numTaxonBelow | |
familiesGeneDuplication = self.familiesGeneDuplication | |
familiesGeneDuplicationMaxNode = self.familiesGeneDuplicationMaxNode | |
familiesGeneLosses = self.familiesGeneLosses | |
interproscanData = self.interproscanData | |
freeLivingSpecies = self.freeLivingSpecies | |
branchLengths = self.branchLengths | |
familiesToFlag = self.familiesToFlag | |
cladePerSpecies = self.cladePerSpecies | |
speciesPerClade = self.speciesPerClade | |
listOfSpecies = self.listOfSpecies | |
nodeInfo = self.nodeInfo | |
nodeInfoSpp = self.nodeInfoSpp | |
familiesDictLengths = self.familiesDictLengths | |
#2) output file headers | |
self.outputFile.write("familyID\tFlag\tn_species\tn_genes\tMean_genes_per_species\tMedian_genes_per_species\tVariation_coefficient_n_genes_per_species\tn_paralogs\tBranch_name\tCompleteness_score\tTotal_duplications\tMax_duplications_node\tMost_frequent_species\t") | |
for species in sorted(freeLivingSpecies): | |
self.outputFile.write(species+"\t") | |
self.outputFile.write("Total_losses\tMedian_branch_length\tPfam_perc_genes\tTMHMM_perc_genes\tSignalP_perc_genes\tPfam_perc_in_family\tPfam_all_in_family\n") | |
outputFileMeasures = open(self.outputFile.name+"_measures","w") #special file just for the measures | |
outputFileMeasures.write("familyID\tFlag\tn_species\tn_genes\tmean_prot_len\tsum_prot_len\tSpecies_var_coef\tSpecies_var_coef_zeroes\tVar_coef_lengths_zeroes\t") | |
for clade in sorted(speciesPerClade): | |
outputFileMeasures.write("zscore_%s\t" % clade) | |
outputFileMeasures.write("Max_zscore_clade\tMax_zscore\t") | |
for clade in sorted(speciesPerClade): | |
outputFileMeasures.write("enrich_%s\t" % (clade) ) | |
outputFileMeasures.write("Max_enrich_clade\tMax_enrich\t") | |
outputFileMeasures.write("Pfam_perc_in_family\tPfam_all_in_family\n") | |
#3) Loop through each compara family, and produce all stats for this family | |
count = 0 #count processed families | |
for family in familiesDict: | |
count+=1 | |
#3.1) General basic stats | |
nSpecies = len(familiesDict[family]) #note that this familiesDict only contain species entry if any gene in that family (does not count zeroes) | |
nGenesPerSpecies = [len(familiesDict[family][sp]) for sp in familiesDict[family]] | |
proteinLengthsSpecies = [familiesDictLengths[family][sp] for sp in familiesDictLengths[family]] #this is a list of per-species protein length sums
nGenes = sum(nGenesPerSpecies) | |
sumProteinLengths = sum(proteinLengthsSpecies) | |
avgProteinLenPerGene = "%.1f" % (sumProteinLengths/nGenes) | |
nParalogs = nGenes - nSpecies | |
avgGenesPerSpecies = "%.1f" % (mean(nGenesPerSpecies)) | |
medianGenesPerSpecies = "%.1f" % (median(nGenesPerSpecies)) | |
stdevGenesPerSpecies = std(nGenesPerSpecies,ddof=1) | |
# 21-Aug-2015: Variation coefficient = Stdev of number of genes in species with at least one gene / mean of number of genes in species with at least one gene | |
# this excludes the zeroes (species without any gene in that family) | |
stdevDivMean = "%.1f" % (stdevGenesPerSpecies/float(avgGenesPerSpecies)) # Variations in numbers of genes per family across species -> stdev / mean | |
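#Worked example of the variation coefficient above (illustrative numbers): genes per species [4, 1, 1]
#gives mean = 2.0 and sample std (ddof=1) ~= 1.73, so the value written out is 1.73/2.0 ~= 0.9.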
#3.2) tag families to exclude | |
if family in familiesToFlag: | |
familyFlag = "exclude" | |
else: | |
familyFlag = "" | |
#3.3) Calculate/retrieve Branch name, median branch length and completeness score | |
branchName = familiesTaxon[family] | |
if family in branchLengths: | |
medianBranchLength = branchLengths[family] | |
else: #this should not happen except for a couple of known exceptions e.g. family 1109776 when only 33 species, because pruning did not work. #as I do not know if there are/will be other exceptions I decided to put "NA" when it does not work. | |
medianBranchLength = "NA" | |
nSpeciesBelowTaxon = numTaxonBelow[branchName] | |
completeness = "%.2f" % (nSpecies/nSpeciesBelowTaxon) | |
#Validation completeness for family: 80624. On James plot shows 0.5 completeness score. VERIFIED 80624 41 319 7.8 2.0 278 Metazoa40001000 0.5 | |
#Validation completeness for family: 63382. On James plot shows <0.25 completeness score. VERIFIED 63382 10 393 39.3 36.5 383 Chromadorea40001076 0.2 | |
#3.4) recalculate species variation coefficient, having insight from phylogeny. | |
#filling the zeroes when a species should have a gene (based on tree root) but does not | |
zeroesMissing = nSpeciesBelowTaxon - nSpecies #species that should have a gene but don't, based on the family species root | |
assert(zeroesMissing >= 0) | |
nGenesPerSpeciesZeroes = nGenesPerSpecies[:] #nGenesPerSpeciesZeroes = nGenesPerSpecies[:] | |
for i in range(zeroesMissing): | |
nGenesPerSpeciesZeroes.append(0) | |
stdnGenesPerSpeciesZeroes = std(nGenesPerSpeciesZeroes,ddof=1) | |
meannGenesPerSpeciesZeroes = mean(nGenesPerSpeciesZeroes) | |
speciesVarCoefZeroes = "%.1f" % (stdnGenesPerSpeciesZeroes / meannGenesPerSpeciesZeroes ) | |
#variation coefficient for protein lengths | |
proteinLengthsSpeciesZeroes = proteinLengthsSpecies[:] #nGenesPerSpeciesZeroes = nGenesPerSpecies[:] | |
for i in range(zeroesMissing): | |
proteinLengthsSpeciesZeroes.append(0) | |
stdProteinLengthsSpeciesZeroes = std(proteinLengthsSpeciesZeroes,ddof=1) | |
meanProteinLengthsSpeciesZeroes = mean(proteinLengthsSpeciesZeroes) | |
speciesVarCoefZeroesLengths = "%.1f" % (stdProteinLengthsSpeciesZeroes / meanProteinLengthsSpeciesZeroes ) | |
#3.5) retrieve gene losses and gene duplication data | |
if family in familiesGeneDuplication: | |
duplications = familiesGeneDuplication[family] | |
else: | |
duplications = 0 | |
if family in familiesGeneLosses: | |
losses = familiesGeneLosses[family] | |
else: | |
losses = 0 | |
if family in familiesGeneDuplicationMaxNode: | |
maxDuplicationNode = familiesGeneDuplicationMaxNode[family] | |
else: | |
maxDuplicationNode = "None" | |
#initialize interproscan-related items and others | |
nGenesWithPfam = 0 #at least 1 Pfam entry | |
nGenesWithSignalP = 0 #at least 1 SignalP entry | |
nGenesWithTMHMM = 0 #at least 1 TMHMM entry | |
pfamTerms = [] # this is to store one pfam domain example per gene | |
pfamTermsAll = [] # this is to store all pfam domains in a gene (not just one per gene) | |
maxSp = "" #the species with most number of genes | |
maxSpProtLen = 0 | |
protLenPerClade = {} #key -> clade, val -> list of protein lengths in species of that clade . note that I only initialize a clade if any species on that clade, and only contains data on species with at least one gene. | |
protLenPerSp = {} #key -> sp, val -> prot length | |
#3.6) Loop through each species in gene family. To retrieve interproscan info and make other calculations | |
for species in sorted(familiesDict[family]): | |
#ngenesSp = len(familiesDict[family][species]) #if calculating measures based on gene numbers | |
protLenSp = familiesDictLengths[family][species] #if calculating measures using protein length instead of gene numbers, without changing all variable names.. | |
protLenPerSp[species] = familiesDictLengths[family][species] #redundant but needed #only initializes species if any gene | |
#3.7) Get genes per clade | |
clade = cladePerSpecies[species] | |
if clade not in protLenPerClade: | |
protLenPerClade[clade] = [] | |
protLenPerClade[clade].append(protLenSp) | |
#3.8) get measure of species with most genes in family | |
if protLenSp > maxSpProtLen: | |
maxSpProtLen = protLenSp | |
maxSp = species | |
# #testing: block start. Uncomment this when running for real | |
#3.9) get interproscan data | |
for gene in familiesDict[family][species]: | |
if gene in interproscanData[species]: #it is normal that a gene may not have any interproscan entries. | |
#Validation. Gene present in IPR file and with correct description | |
#Python: syphacia_muris SMUV_0000960301 {'Pfam': ['SMUV_0000960301-mRNA-1\tPfam\tprotein_match\t71\t295\t4.6E-58\t+\t.\tName=PF00089;signature_desc=Trypsin;Target=SMUV_0000960301-mRNA-1 71 295;status=T;ID=match$342_71_295;Ontology_term="GO:0004252","GO:0006508";date=04-06-2014;Dbxref="InterPro:IPR001254"']} | |
#Farm: /lustre/scratch108/parasites/dr7/50HG/50HG_100K_families_parsing/interproscanGFFs_testing$ grep SMUV_0000960301 SMUV.protein.fa.gz.fas.ipr | |
# SMUV_0000960301-mRNA-1 Pfam protein_match 71 295 4.6E-58 + . Name=PF00089;signature_desc=Trypsin;Target=SMUV_0000960301-mRNA-1 71 295;status=T;ID=match$342_71_295;Ontology_term="GO:0004252","GO:0006508";date=04-06-2014;Dbxref="InterPro:IPR001254" | |
typesOfEntries = list(interproscanData[species][gene].keys()) #these are dictionary keys, therefore no duplicates
assert (len(typesOfEntries) <= len(self.targetSources)) | |
for entryType in typesOfEntries: | |
if "Pfam" in entryType: | |
nGenesWithPfam+=1 | |
spl = "".join(interproscanData[species][gene][entryType]).split(";") #there may be several lines of pfam domains for same gene | |
#Note: the presence of pfam entries for the same gene in the IPR GFF is random, (i.e. the last entry is not always the most N-terminal domain etc, its just random), | |
#Note(cont): meaning that picking only 1 pfam example for each gene is randomized (but the same everytime we run this) | |
##originally I was picking only one pfam domain per gene | |
desc = "" | |
pfamTermsSet = set() | |
for item in spl: | |
if item.startswith("signature_desc="): | |
desc = item.replace("signature_desc=","") | |
pfamTermsSet.add(desc) | |
for term in pfamTermsSet: | |
pfamTerms.append(term) | |
##then I also added column where we have all pfam domains in gene | |
for item in spl: | |
desc = "" | |
if item.startswith("signature_desc="): | |
desc = item.replace("signature_desc=","") | |
pfamTermsAll.append(desc) | |
if "SignalP" in entryType: #this can in principle match any SignalP_* entry | |
nGenesWithSignalP+=1 | |
if "TMHMM" in entryType: | |
nGenesWithTMHMM+=1 | |
# else: #dr7@farm3-head2:/lustre/scratch108/parasites/dr7/50HG/50HG_100K_families_parsing/interproscanGFFs_testing$ grep HNAJ_0000929301 HNAJ.protein.fa.gz.fas.fas.sl.ipr | |
# print (species,gene) | |
#testing: block end. Uncomment this when running for real | |
mostFrequentSpecies = str(maxSpProtLen)+"X:"+maxSp #species with most genes/prot length, and how many | |
#3.10) calculations on interproscan data #note this really looks at genes, not at protein length. Stats are gene-based, not altered for protein length. | |
assert (nGenesWithPfam <= nGenes) | |
assert (nGenesWithTMHMM <= nGenes) | |
assert (nGenesWithSignalP <= nGenes) | |
percPfam = "%.1f" % (nGenesWithPfam*100.0/nGenes) | |
percSignalP = "%.1f" % (nGenesWithSignalP*100.0/nGenes) | |
percTMHMM = "%.1f" % (nGenesWithTMHMM*100.0/nGenes) | |
##Getting 3 most common pfam terms, and % genes in family with that pfam | |
pfams = {} | |
pfamText = "" | |
for term in pfamTerms: | |
if term not in pfams: | |
pfams[term] = float("%.1f" % (pfamTerms.count(term)*100.0/nGenes ) ) | |
pfamsSorted = sorted(pfams, key=pfams.get, reverse=True) #need numeric sorting | |
for term in pfamsSorted[0:3]: #top 3 pfam domains #if less than 3 exist will simply pick less | |
pfamText+=str(pfams[term])+"%:"+term+";" | |
if pfamText == "": | |
pfamText = "100%:None" | |
##Getting all pfam domains in family statistics, and how many times | |
pfamsAll = {} | |
pfamTextAll = "" | |
for term in pfamTermsAll: | |
if term not in pfamsAll: | |
pfamsAll[term] = pfamTermsAll.count(term) | |
pfamsSortedAll = sorted(pfamsAll, key=pfamsAll.get, reverse=True) | |
for term in pfamsSortedAll: | |
pfamTextAll+=str(pfamsAll[term])+"X:"+term+";" | |
if pfamTextAll == "": | |
pfamTextAll = "0X:None" | |
#3.11) calculate ranking measures | |
### IMPORTANT NOTE ### 14-Sep: Even though variable names and comments still say "genes", it may actually be using protein lengths. | |
#Note that all these measures count zeroes. | |
#E.g. I know the species tree node corresponding to root of the gene tree. Therefore I always take into account all the species AND clades that SHOULD be present in the calculation. | |
##3.11.1) Calculate Z-score for each clade. Z-score = (mean prot length per species in the clade - mean prot length per species overall) / (std of prot length per species outside the clade; see the 14-Jan-2016 change below)
#This only works if more than 1 clade. Otherwise, all entries have 'NA' | |
#I always check root and calculate for each clade descendant of the root. Other clades will have 'NA'. | |
#note that I always divide by all species in the clade, regardless of whether all of them have a gene or not
#note: standard deviation can be zero (e.g. all species having same number prot length), and we cannot divide by zero. I tag these cases with "NA_std_is_zero" | |
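#Worked example of the z-score below (illustrative numbers): if the clade's per-species mean protein length
#is 1.75, the overall per-species mean (zeroes included) is 0.89, and the std dev of protein lengths in the
#species *outside* the clade is 0.5, then zscore = (1.75 - 0.89) / 0.5 = 1.72.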
#get clades that should be present. | |
if branchName in nodeInfo: #if not, it means it is a leaf, not an internal node | |
descendantClades = nodeInfo[branchName] | |
else: | |
assert("_" in branchName) | |
descendantClades = [] | |
#calculate mean protein length per clade #adding zeroes if a species does not have a gene
meanProtLenPerClade = {} #key -> clade, value -> mean prot length per species in that clade (total, including zeroes) #note this dict only contains clade entries if they should have any value | |
sdProtLenOutsideClade = {} #key -> clade, value -> std dev of prot length per species that are *not* in this clade. Added Avril Coghlan 14-Jan-2016 | |
for clade in descendantClades: | |
if clade in protLenPerClade: | |
meanProtLenPerClade[clade] = sum(protLenPerClade[clade]) / float(len(speciesPerClade[clade] ) ) | |
# calculate the standard deviation of protein lengths in species outside this clade. Added Avril Coghlan 14-Jan-2016 | |
otherSpecies = [] #list of protein lengths for the species not in current clade, will include zeroes. Added Avril Coghlan 14-Jan-2016 | |
for species in nodeInfoSpp[branchName]: #species under the tree root. Added Avril Coghlan 14-Jan-2016. | |
if cladePerSpecies[species] == clade: # Added Avril Coghlan 14-Jan-2016 | |
continue # Added Avril Coghlan 14-Jan-2016 | |
else: # Added Avril Coghlan 14-Jan-2016 | |
if species in protLenPerSp: #protLenPerSp only contains species entry if any gene. Added Avril Coghlan 14-Jan-2016. | |
otherSpecies.append(protLenPerSp[species]) # Added Avril Coghlan 14-Jan-2016. | |
else: # Added Avril Coghlan 14-Jan-2016. | |
otherSpecies.append(0) # Added Avril Coghlan 14-Jan-2016. | |
stdProteinLengthsSpeciesZeroesOutsideClade = std(otherSpecies, ddof=1) # Added Avril Coghlan 14-Jan-2016. Used ddof=1 as Diogo has used this elsewhere for std. dev. | |
sdProtLenOutsideClade[clade] = stdProteinLengthsSpeciesZeroesOutsideClade # Added Avril Coghlan 14-Jan-2016. | |
else: #if no species have it | |
meanProtLenPerClade[clade] = 0 | |
sdProtLenOutsideClade[clade] = 0 # Added by bh4 - 17-Jan-2016 | |
assert(set(meanProtLenPerClade.keys()) == set(descendantClades) ) | |
#actually calculate z-scores | |
zscoreResults = {} #key -> clade, val -> zscore | |
maxZscore = 0 #will only count positive Zscores | |
maxZclade = "NA" | |
for clade in sorted(speciesPerClade): #just so I loop over all clades | |
if len(descendantClades) < 2: #cannot calculate clade Z-score if only one clade | |
zscoreResults[clade] = "NA" | |
else: | |
if clade in meanProtLenPerClade: | |
# use the standard deviation of protein lengths in species outside this clade. Added Avril Coghlan 14-Jan-2016. | |
stdProteinLengthsSpeciesZeroesOutsideClade = sdProtLenOutsideClade[clade] # Added Avril Coghlan 14-Jan-2016. | |
# if stdnGenesPerSpeciesZeroes == 0: Commented out Avril Coghlan 14-Jan-2016 | |
if stdProteinLengthsSpeciesZeroesOutsideClade == 0: # Added Avril Coghlan 14-Jan-2016 | |
zscoreResults[clade] = "NA_std_is_zero" | |
else: | |
# use the standard deviation of protein lengths in species outside this clade. Added Avril Coghlan 14-Jan-2016. | |
zscore = (meanProtLenPerClade[clade] - meanProteinLengthsSpeciesZeroes) / stdProteinLengthsSpeciesZeroesOutsideClade # Added Avril Coghlan 14-Jan-2016 | |
# zscore = (meanProtLenPerClade[clade] - meanProteinLengthsSpeciesZeroes) / stdProteinLengthsSpeciesZeroes. Commented out Avril Coghlan 14-Jan-2016 | |
#validated using http://ncalculators.com/statistics/z-score-calculator.htm | |
zscoreResults[clade] = zscore | |
if zscore > maxZscore: | |
maxZscore = zscore | |
maxZclade = clade | |
else: | |
zscoreResults[clade] = "NA" | |
maxZscore = "%.2f" % maxZscore | |
#Z-score calculation validation: | |
# ##Mono-clade family | |
# family 1212416 : NECAME_04199 (necator_americanus) NBR_0000394301 (nippostrongylus_brasiliensis) NBR_0001482701 (nippostrongylus_brasiliensis) NBR_0002215501 (nippostrongylus_brasiliensis) | |
# 1212416 0.5 0.5 NA NA NA NA NA NA NA 0.0 | |
# #CORRECT | |
# ##Two-neighbour-clade family | |
# family 851973 : csin104847 (clonorchis_sinensis) csin112867 (clonorchis_sinensis) D915_09852 (fasciola_hepatica) D915_09853 (fasciola_hepatica) SSLN_0001350701 (schistocephalus_solidus) Sjp_0074790 (schistosoma_japonicum) Sjp_0074800 (schistosoma_japonicum) Smp_166880 (schistosoma_mansoni) | |
# Em Ts Hm Mc Ss Cs Fh Sj Sm | |
# 0 0 0 0 1 2 2 2 1 | |
# avg 0.888888889 zscore tapeworm -0.787400787 | |
# std 0.874889764 zscore fluke 0.984250984 | |
# tapeworm avg 0.2 | |
# fluke avg 1.75 | |
# #VALUES VALIDATED, MANUAL CALCULATION! | |
# ##Root with 6 clades, but genes in only 3 clades. | |
# descendants: {'IV', 'V', 'Tapeworm', 'III', 'I', 'Flukes'} data keys: ['I', 'IV', 'Flukes'] | |
# family 919890 : csin102733 (clonorchis_sinensis) D915_07255 (fasciola_hepatica) MhA1_Contig762.frz3.gene6 (meloidogyne_hapla) nRc.2.0.1.g27258 (romanomermis_culicivorax) Sjp_0091230 (schistosoma_japonicum) Smp_090010 (schistosoma_mansoni) | |
# #VALUES VALIDADED, MANUAL CALCULATION! | |
##3.11.2) Calculate clade gene enrichment (Avril's/Bhavana's measure)
# mean prot length per species in a clade/mean prot length per species in all other species | |
#Note that I use root to see which clades could be present, and I count zeroes on clades and species. When I say "all other species" it consists of the species that could have a gene based on root. | |
# enrichResults = {} #key -> clade, val -> zscore | |
# maxEnrich = 0 #will only count positive Zscores | |
# maxEnrichClade = "NA" | |
# for clade in sorted(speciesPerClade): #just so I loop over all clades | |
# if len(descendantClades) < 2: #cannot calculate if only one clade | |
# enrichResults[clade] = "NA" | |
# else: | |
# if clade in meanProtLenPerClade: #clades that should have genes (but may not have) | |
# otherSpecies = [] #list of number of genes for the species not in current clade, will include zeroes | |
# for cladeDesc in descendantClades: #loop over all clades that should/could contain a gene based on root, not just the ones that do have a gene | |
# if cladeDesc == clade: #if current clade | |
# continue | |
# if cladeDesc in protLenPerClade: #this means that there will be at least one species with a gene here | |
# otherSpecies.extend(protLenPerClade[cladeDesc]) | |
# missingZeroes = speciesPerClade[cladeDesc] - len(protLenPerClade[cladeDesc]) #the protLenPerClade only contains data on species with genes, therefore need to add zeroes | |
# for m in range(missingZeroes): | |
# otherSpecies.append(0) | |
# else: #add zeroes accordingly | |
# l = [0]*speciesPerClade[cladeDesc] | |
# otherSpecies.extend(l) | |
# #print (family, branchName, clade, len(otherSpecies),speciesPerClade[clade],nSpeciesBelowTaxon) | |
# assert (len(otherSpecies) + speciesPerClade[clade] == nSpeciesBelowTaxon) | |
# enrich = meanProtLenPerClade[clade] / mean(otherSpecies) #meannGenesPerSpeciesZeroes #mean(otherSpecies) | |
# enrichResults[clade] = enrich | |
# if enrich > maxEnrich: | |
# maxEnrich = enrich | |
# maxEnrichClade = clade | |
# else: #so that I write NA in clades which should not have genes in this family | |
# enrichResults[clade] = "NA" | |
# assert (len(enrichResults) == len(speciesPerClade)) | |
# maxEnrich = "%.2f" % maxEnrich | |
# ##Another approach to calculate clade enrichment, allowing paraphyletic groups. | |
#Note that if gene tree root is in between a paraphyletic group, the score is calculated for all members of the paraphyletic clade. | |
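#Worked example of the enrichment below (illustrative numbers): if the clade's per-species mean protein
#length is 1.75 and the mean over the other species under the gene-tree root (zeroes included) is 0.5,
#then enrich = 1.75 / 0.5 = 3.5 for that clade.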
enrichResults = {} #key -> clade, val -> zscore | |
maxEnrich = 0 #max will only count positive Zscores | |
maxEnrichClade = "NA" | |
for clade in sorted(speciesPerClade): #loop over all clades, regardless of having gene or not, to fill all columns in output | |
if len(descendantClades) < 2: #cannot calculate if only one clade | |
enrichResults[clade] = "NA" | |
else: #calculate enrichment | |
if clade in meanProtLenPerClade: #clades that should have genes (but may not have) #note that meanProtLenPerClade already accounts for zeroes | |
otherSpecies = [] #list of protein lengths for the species not in current clade, will include zeroes | |
countSppInClade = 0 #just for validation #because of paraphyletic groups, the number of species in a clade may be different from number of species below gene tree root. | |
for species in nodeInfoSpp[branchName]: #species under the tree root | |
if cladePerSpecies[species] == clade: | |
countSppInClade+=1 | |
continue | |
else: | |
if species in protLenPerSp: #protLenPerSp only contains species entry if any gene | |
otherSpecies.append(protLenPerSp[species]) | |
else: | |
otherSpecies.append(0) | |
assert (len(otherSpecies) + countSppInClade == nSpeciesBelowTaxon) | |
enrich = meanProtLenPerClade[clade] / mean(otherSpecies) | |
enrichResults[clade] = enrich | |
if enrich > maxEnrich: | |
maxEnrich = enrich | |
maxEnrichClade = clade | |
else: #so that I write NA in clades which do not have genes in this family | |
enrichResults[clade] = "NA" | |
assert (len(enrichResults) == len(speciesPerClade)) | |
maxEnrich = "%.2f" % maxEnrich | |
##Confirmed that this approach gives the same results as the previous, if using same dataset. | |
#4) Writing to output file, one family at a time | |
#main output file | |
self.outputFile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t" % (family,familyFlag,nSpecies,nGenes,avgGenesPerSpecies,medianGenesPerSpecies,stdevDivMean,nParalogs,branchName,completeness,duplications,maxDuplicationNode,mostFrequentSpecies) ) | |
##Presence in free-living species | |
for species in sorted(freeLivingSpecies): | |
if species in familiesDict[family]: | |
self.outputFile.write("%s\t" % len(familiesDict[family][species])) | |
else: | |
self.outputFile.write("0\t") | |
self.outputFile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (losses,medianBranchLength,percPfam,percTMHMM,percSignalP,pfamText,pfamTextAll) ) | |
#measures file | |
outputFileMeasures.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t" % (family,familyFlag,nSpecies,nGenes,avgProteinLenPerGene,sumProteinLengths,stdevDivMean,speciesVarCoefZeroes,speciesVarCoefZeroesLengths) ) | |
for clade in sorted(speciesPerClade): | |
outputFileMeasures.write("%s\t" % (zscoreResults[clade]) ) | |
outputFileMeasures.write("%s\t%s\t" % (maxZclade,maxZscore) ) | |
for clade in sorted(speciesPerClade): | |
outputFileMeasures.write("%s\t" % (enrichResults[clade]) ) | |
outputFileMeasures.write("%s\t%s\t" % (maxEnrichClade,maxEnrich) ) | |
outputFileMeasures.write("%s\t%s\n" % (pfamText,pfamTextAll) ) | |
self.outputFile.close() | |
outputFileMeasures.close() | |
if self.verbose: print ("Total number of families processed:\t%s" % count ) | |
def writeFamilies(self): | |
"""(Optional function) Writes the families (like in families.txt file) after all the filterings, the way they are actually processed. | |
On the original files there is no apparent gene order sorting. Here there will be species sorting. | |
Intended output example | |
------------------------------------------------------------------------------- | |
family 1584328 : ASU_08438 (ascaris_suum) ALUE_0001329201 (ascaris_lumbricoides) | |
------------------------------------------------------------------------------- | |
family 1584332 : nRc.2.0.1.g04834 (romanomermis_culicivorax) nRc.2.0.1.g36069 (romanomermis_culicivorax) | |
""" | |
if self.verbose: print ("Writing new families.txt file..") | |
familiesDict = self.familiesDict | |
dumpFamiliesFile = open(self.dumpFamiliesFile,"w") | |
count = 0 | |
for family in sorted(familiesDict): | |
count+=1 | |
text = "-------------------------------------------------------------------------------\n" | |
text+= "family %s :" % family | |
for species in sorted(familiesDict[family]): | |
for gene in sorted(familiesDict[family][species]): | |
text+=" %s (%s)" % (gene,species) | |
text+="\n" | |
dumpFamiliesFile.write(text) | |
dumpFamiliesFile.close() | |
if self.verbose: print ("Wrote %s families at %s" % (count,dumpFamiliesFile.name)) | |
def cleanUp(self): | |
"""removes temporary files""" | |
#pass | |
os.system("rm tmp_familiesPerTaxonomyLevel_"+self.jobID+".txt") | |
def dumpAll(self): | |
if self.verbose: print ("Writing pickle dump files..") | |
pickle.dump(self.familiesDict,open(self.dumpsFolder+"/familiesDict","wb") ) | |
pickle.dump(self.familiesTaxon,open(self.dumpsFolder+"/familiesTaxon","wb") ) | |
pickle.dump(self.numTaxonBelow,open(self.dumpsFolder+"/numTaxonBelow","wb") ) | |
pickle.dump(self.familiesGeneDuplication,open(self.dumpsFolder+"/familiesGeneDuplication","wb") ) | |
pickle.dump(self.familiesGeneDuplicationMaxNode,open(self.dumpsFolder+"/familiesGeneDuplicationMaxNode","wb") ) | |
pickle.dump(self.familiesGeneLosses,open(self.dumpsFolder+"/familiesGeneLosses","wb") ) | |
pickle.dump(self.tagSpeciesName,open(self.dumpsFolder+"/tagSpeciesName","wb") ) | |
pickle.dump(self.geneIDMapping,open(self.dumpsFolder+"/geneIDMapping","wb") ) | |
pickle.dump(self.interproscanData,open(self.dumpsFolder+"/interproscanData","wb") ) | |
pickle.dump(self.branchLengths,open(self.dumpsFolder+"/branchLengths","wb") ) | |
pickle.dump(self.cladePerSpecies,open(self.dumpsFolder+"/cladePerSpecies","wb") ) | |
pickle.dump(self.speciesPerClade,open(self.dumpsFolder+"/speciesPerClade","wb") ) | |
pickle.dump(self.speciesTree,open(self.dumpsFolder+"/speciesTree","wb") ) | |
pickle.dump(self.removeOutgroups,open(self.dumpsFolder+"/removeOutgroups","wb") ) | |
pickle.dump(self.speciesToRemove,open(self.dumpsFolder+"/speciesToRemove","wb") ) | |
pickle.dump(self.familiesToFlag,open(self.dumpsFolder+"/familiesToFlag","wb") ) | |
pickle.dump(self.listOfSpecies,open(self.dumpsFolder+"/listOfSpecies","wb") ) | |
pickle.dump(self.geneIDMappingAllEntries,open(self.dumpsFolder+"/geneIDMappingAllEntries","wb") ) | |
pickle.dump(self.geneFilterList,open(self.dumpsFolder+"/geneFilterList","wb") ) | |
pickle.dump(self.nodeInfo,open(self.dumpsFolder+"/nodeInfo","wb") ) | |
pickle.dump(self.nodeInfoSpp,open(self.dumpsFolder+"/nodeInfoSpp","wb") ) | |
pickle.dump(self.proteinLengths,open(self.dumpsFolder+"/proteinLengths","wb") ) | |
pickle.dump(self.familiesDictLengths,open(self.dumpsFolder+"/familiesDictLengths","wb") ) | |
# for key in vars(self): | |
# pickle.dump(eval("self."+key),open("dumps/"+str(key),"wb") ) | |
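# A generic alternative to the eval-based loop above (a sketch only, not used here): iterate over
# the instance attributes with vars() and skip open file handles, which are not picklable
# (e.g. self.comparaFamiliesFile). Whether every remaining attribute should be persisted is an
# assumption, so the explicit per-attribute dumps above remain the method of record.
# for key, value in vars(self).items():
#     if hasattr(value, "read"): continue #skip open file handles; pickle cannot serialize them
#     pickle.dump(value, open(self.dumpsFolder+"/"+str(key), "wb"))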
def loadAll(self,dumpPath): | |
if self.verbose: print ("Loading pickle dump files..") | |
#loading only the required data | |
self.familiesDict=pickle.load(open(dumpPath+"/familiesDict","rb") ) | |
self.familiesTaxon=pickle.load(open(dumpPath+"/familiesTaxon","rb") ) | |
self.numTaxonBelow=pickle.load(open(dumpPath+"/numTaxonBelow","rb") ) | |
self.familiesGeneDuplication=pickle.load(open(dumpPath+"/familiesGeneDuplication","rb") ) | |
self.familiesGeneDuplicationMaxNode=pickle.load(open(dumpPath+"/familiesGeneDuplicationMaxNode","rb") ) | |
self.familiesGeneLosses=pickle.load(open(dumpPath+"/familiesGeneLosses","rb") ) | |
self.interproscanData=pickle.load(open(dumpPath+"/interproscanData","rb") ) #comment this line out to run faster during development; keep it when running for real
self.branchLengths=pickle.load(open(dumpPath+"/branchLengths","rb") ) | |
self.cladePerSpecies=pickle.load(open(dumpPath+"/cladePerSpecies","rb") ) | |
self.speciesPerClade=pickle.load(open(dumpPath+"/speciesPerClade","rb") ) | |
self.familiesToFlag=pickle.load(open(dumpPath+"/familiesToFlag","rb") ) | |
self.listOfSpecies=pickle.load(open(dumpPath+"/listOfSpecies","rb") ) | |
self.nodeInfo=pickle.load(open(dumpPath+"/nodeInfo","rb") ) | |
self.nodeInfoSpp=pickle.load(open(dumpPath+"/nodeInfoSpp","rb") ) | |
self.familiesDictLengths=pickle.load(open(dumpPath+"/familiesDictLengths","rb") ) | |
def run(self): | |
"""Runs the functions in order.""" | |
self.readGeneIDMappingFolder() #this has to be run before the readGeneFilterList() | |
if self.removeOutgroupsFile != "": self.readOutgroupList() #this has to be run before readFamilies() | |
if self.geneFilterListFile != "": self.readGeneFilterList() #this has to be run before readFamilies() | |
self.readProteinLengthFiles() #this has to be run before readFamilies | |
self.readFamilies() | |
self.readSpeciesTree() | |
if self.removeOutgroupsFile != "": self.writeFamiliesPerTaxonomyLevel() #this has to be run after readFamilies() | |
if self.flagFamilyListFile != "": self.readFlagFamilyList() #this can be run anywhere before statsPerFamily() | |
self.readFamiliesPerTaxonomyLevel() #note that this will read a file created in this script | |
self.readGeneDuplicationsFolder() | |
self.readGeneLossesFolder() | |
self.readSpeciesNameTag() | |
self.readInterproGFFFolder() | |
self.readBranchLengths() | |
self.readCladeFile() | |
self.readNodeInfoFile() #has to be run after readCladeFile | |
if self.dumpsFolder != "": self.dumpAll() | |
self.statsPerFamily() | |
if self.dumpFamiliesFile != "": self.writeFamilies() | |
self.cleanUp() | |
def runLoad(self,dumpPath): | |
"""For script development. To use if just changing the statsPerFamily function.""" | |
if self.verbose: print ("Loading dumps..") | |
self.loadAll(dumpPath) #this loads are required objects to runs stats on them | |
self.statsPerFamily() | |
if self.dumpFamiliesFile != "": self.writeFamilies() | |
def main(): | |
import argparse | |
parser = argparse.ArgumentParser(description='Analysis suite for exploring Compara families.') | |
#positional args | |
parser.add_argument('comparaFamilies', metavar='comparaFamilies', type=str, help='Flat file. Compara families file. One family per line, list of genes/species. Output from get_all_50HG_families_v75.pl.') | |
parser.add_argument('familiesPerTaxonomyLevel', metavar='familiesPerTaxonomyLevel', type=str, help='Flat file. One taxonomy level per line, list of families. Output from get_families_per_taxonomy_level_v75.pl.') | |
parser.add_argument('speciesTree', metavar='speciesTree', type=str, help='Species tree in newick format. Each node should have a unique name; this expects species/node+ID (e.g. dorylaimia40001065, romanomermis_culicivorax40001064). Output from get_species_tree_v75.pl.')
parser.add_argument('geneDuplicationsFolder', metavar='geneDuplicationsFolder', type=str, help='Folder with a file for each taxon/node. File names should be e.g. ciona_intestinalis40001180.txt. Output from get_gene_duplications_per_taxonomy_level_v75.pl.') | |
parser.add_argument('geneLossesFolder', metavar='geneLossesFolder', type=str, help='Folder with a file for each taxon/node. File names should be e.g. ciona_intestinalis40001180.txt. Output from get_gene_losses_per_taxonomy_level_v75.pl.') | |
parser.add_argument('speciesNameTag', metavar='speciesNameTag', type=str, help='TSV file. Locus tag - Species name correspondence. PREFIX Genus species VERSION.') | |
parser.add_argument('geneIDMappingFolder', metavar='geneIDMappingFolder', type=str, help='Folder with a *_id_mapping.txt TSV file for each species, with Gene IDs from different sources. SPECIES_NAME GENEMEMBER_ID SEQMEMBER_ID TRANSCRIPT_ID PROTEIN_ID IPRSCAN_GFF_ID.') | |
parser.add_argument('interproscanGFFFolder', metavar='interproscanGFFFolder', type=str, help='Folder with a *ipr* GFF (InterProScan) file for each species. Files can be gzipped.')
parser.add_argument('branchLengths', metavar='branchLengths', type=str, help='TSV file. One line per Compara family: family ID in the first column, average branch length of the tree in the second column, median in the third column. Output from get_compara_branch_length.py.')
parser.add_argument('speciesClade', metavar='speciesClade', type=str, help='TSV file. One line per species, first column species name as in comparaFamilies, second column the clade name/number, third column either Parasitic or Freeliving. E.g. loa_loa\tIII\tParasitic') | |
parser.add_argument('nodeInfo', metavar='nodeInfo', type=str, help='File with leaves per each node in the tree. Works even if whole species tree was used (no need to prune). Produced by ~alc/Documents/git/Python/parse_tree_with_ETE.py') | |
parser.add_argument('proteinLengthsFolder', metavar='proteinLengthsFolder', type=str, help='Path to a folder with one file per species. Files need to have the same names as the species and contain a list of all proteins and their respective lengths, in amino acids. Also used to recreate singleton lists. From get_protein_lengths_all_species_v75.pl.')
#optional argument | |
parser.add_argument('--outputFile', metavar='outputFile', default = os.getcwd()+"/comparaFamiliesParser.out", type=str, help='File where important output will be written (but you should also store stdout and stderr). If none is given, a file called comparaFamiliesParser.out will be created in the current working directory.')
parser.add_argument('--verbose', metavar='verbose', default = 1, type=int, help='Verbose mode (1 = Yes, 0 = No). Default = 1.') | |
parser.add_argument('--removeOutgroups', metavar='removeOutgroups', default = "", type=str, help='Whether to remove outgroup species from the results. Give a file with the list of species/tree nodes to remove/ignore from the analysis, one per line. First column is the node ID (e.g. 400010010), second column is the node/species name, third column is whether this is "species" or "node". Note that this must also include internal nodes. Default = "" (off)')
parser.add_argument('--dumpFamilies', metavar='dumpFamilies', default = "", type=str, help='Option to create an output file with the post-filtered Compara families used here. Give the desired output file name after the argument. Default = "" (off)')
parser.add_argument('--flagFamilyList', metavar='flagFamilyList', default = "", type=str, help='Option to flag specific families with "exclude". Provide a flat file with one family ID per line. Default = "" (off)')
parser.add_argument('--geneFilterList', metavar='geneFilterList', default = "", type=str, help='Option to exclude the listed genes from all output measures. Use mRNA IDs (SEQMEMBER_ID), which may differ from the IDs in the families file. Default = "" (off)')
parser.add_argument('--loadDumps', metavar='loadDumps', default = "", type=str, help='Instead of running the script from scratch, load previously dumped Python objects. Provide the dumps folder path. Default = "" (off)')
parser.add_argument('--dumpsFolder', metavar='dumpsFolder', default = "dumps", type=str, help='Give folder path to create object dumps. Default = create dumps in folder called "dumps".') | |
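#Example invocation of the positional and a few optional arguments, in the order defined above.
#This is a sketch only: the script name and every file/folder name are hypothetical placeholders,
#not files shipped with this script.
#  python3 comparaFamiliesParser.py families.txt familiesPerTaxonomyLevel.txt species_tree.nwk \
#          gene_duplications/ gene_losses/ species_name_tag.tsv id_mapping/ iprscan_gff/ \
#          branch_lengths.tsv species_clades.tsv node_info.txt protein_lengths/ \
#          --removeOutgroups outgroups_to_remove.txt --dumpFamilies filtered_families.txt --dumpsFolder dumps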
args = parser.parse_args() #gets the arguments | |
print (args) | |
#TODO: is there a better way to pass these arguments, e.g. with *args or **kwargs? Could not get it to work using eval.
run = Run(args.comparaFamilies,args.familiesPerTaxonomyLevel,args.geneDuplicationsFolder,args.geneLossesFolder,args.speciesNameTag,args.geneIDMappingFolder,\ | |
args.interproscanGFFFolder,args.branchLengths,args.removeOutgroups,args.speciesTree,args.verbose,args.outputFile,args.dumpFamilies,args.flagFamilyList,\
args.speciesClade,args.geneFilterList,args.dumpsFolder,args.nodeInfo,args.proteinLengthsFolder) | |
if args.loadDumps != "": | |
#to speed up processing, just load data and run stats | |
run.runLoad(args.loadDumps) | |
else: | |
#create dump folder, write parameter file and run all | |
if not os.path.exists(args.dumpsFolder): | |
os.mkdir(args.dumpsFolder) | |
with open(args.dumpsFolder+"/parameters.txt","w") as parametersFile: | |
parametersFile.write(str(args) ) | |
run.run() | |
print ("Finished in %.1f seconds." % (clock()-time0)) | |
if __name__ == '__main__': | |
main() | |