Skip to content

Instantly share code, notes, and snippets.

@spond
Created August 21, 2015 19:50
Show Gist options
  • Save spond/32cd64d7d811139d3499 to your computer and use it in GitHub Desktop.
Save spond/32cd64d7d811139d3499 to your computer and use it in GitHub Desktop.
Pairwise dN/dS estimation
/*
THIS HyPhy BATCH LANGUAGE FILE IS PROVIDED AS AN EXAMPLE/TOOL FOR
THE 2nd EDITION OF 'The Phylogenetic Handbook'
Written by
Sergei L Kosakovsky Pond ([email protected])
Art FY Poon ([email protected])
Simon DW Frost ([email protected])
*/
RequireVersion ("0.9920060830");
VERBOSITY_LEVEL = -1;
/*---------------------------------------------------------------------------------------------------------------------------------------------*/
fprintf (stdout, "\n\nThis example analysis fits the local MG94xREV model \n",
"(separate alpha and beta on each branch) to a codon dataset,\n",
"and computes several distance measures between sequences:\n",
" - expected number of substitutions per codon\n",
" - expected number of syn. substitutions per codon\n",
" - expected number of nonsyn. substitutions per codon\n",
" - dS\n",
" - dN\n",
"and provides the example contrasting the estimation of dN and dS\n",
"between a pair of sequences based on the phylogenetic tree and\n",
"direct pairwise estimation.\n");
ExecuteAFile (HYPHY_LIB_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"Utility"+DIRECTORY_SEPARATOR+"DescriptiveStatistics.bf");
ExecuteAFile (HYPHY_LIB_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"TemplateModels"+DIRECTORY_SEPARATOR+"chooseGeneticCode.def");
ExecuteAFile (HYPHY_LIB_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"dSdNTreeTools.ibf");
ChoiceList (freqs, "Alignment", 1, SKIP_NONE, "Flu H5N1 HA", "Use the example Influenza H5N1 hemagglutinin alignment (5 sequences)",
"Drospophila adh", "Use the example Drosophila ADH alignment (6 sequences).",
"Custom", "Load your own alignment and tree.");
if (freqs < 0)
{
return 0;
}
if (freqs == 2)
{
SetDialogPrompt ("Choose a nucleotide alignment");
DataSet ds = ReadDataFile (PROMPT_FOR_FILE);
}
else
{
if (freqs == 1)
{
GetURL (dataString, "http://www.hyphy.org/phylohandbook/data/Drosophila_adh.nex");
}
else
{
GetURL (dataString, "http://www.hyphy.org/phylohandbook/data/H5N1_HA_5.nex");
}
DataSet ds = ReadFromString (dataString);
}
DataSetFilter filteredData = CreateFilter (ds,3,"","",GeneticCodeExclusions);
SKIP_MODEL_PARAMETER_LIST = 1;
modelDesc = "012345";
modelType = 0;
ExecuteAFile (HYPHY_LIB_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"TemplateModels"+DIRECTORY_SEPARATOR+"MG94custom.mdl");
SKIP_MODEL_PARAMETER_LIST = 0;
ExecuteAFile (HYPHY_LIB_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"queryTree.bf");
LikelihoodFunction theLnLik = (filteredData, givenTree);
fprintf (stdout, "Fitting local MG94xREV to observed data... \n");
Optimize (res, theLnLik);
nonStopCount = ModelMatrixDimension;
ExecuteAFile (HYPHY_LIB_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"Distances"+DIRECTORY_SEPARATOR+"CodonTools.def");
sSites = 0;
nsSites = 0;
synM = {nonStopCount,nonStopCount};
nonSynM = {nonStopCount,nonStopCount};
vertOnes = {nonStopCount,1};
horOnes = {1,nonStopCount};
for (h1 = 0; h1<nonStopCount; h1=h1+1)
{
vertOnes [h1] = 1;
horOnes [h1] = 1;
}
hShift = 0;
for (h1 = 0; h1 < 64; h1=h1+1)
{
gc1 = _Genetic_Code[h1];
if (gc1 == 10)
{
hShift = hShift+1;
}
else
{
sSites = sSites + filteredData.sites * _S_NS_POSITIONS_[0][h1] * vectorOfFrequencies[h1-hShift];
nsSites = nsSites + filteredData.sites * _S_NS_POSITIONS_[1][h1] * vectorOfFrequencies[h1-hShift];
vShift = hShift;
for (v1 = h1+1; v1 < 64; v1=v1+1)
{
gc2 = _Genetic_Code[v1];
if (gc2 == 10)
{
vShift = vShift + 1;
}
else
{
if (gc1 == gc2)
{
synM [h1-hShift][v1-vShift] = vectorOfFrequencies[h1-hShift];
synM [v1-vShift][h1-hShift] = vectorOfFrequencies[v1-vShift];
}
else
{
nonSynM [h1-hShift][v1-vShift] = vectorOfFrequencies[h1-hShift];
nonSynM [v1-vShift][h1-hShift] = vectorOfFrequencies[v1-vShift];
}
}
}
}
}
synSubsAVL = {};
dSAVL = {};
nsSubsAVL = {};
dNAVL = {};
ExecuteAFile (HYPHY_LIB_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"TreeTools.ibf");
fprintf (stdout, "\nTotal nucleotide sites :", filteredData.sites*3,
"\nSynonymous sites :", sSites,
"\nNonsynonymous sites :", nsSites, "\n");
sSites = 3*filteredData.sites/sSites;
nsSites = 3*filteredData.sites/nsSites;
branchNames = BranchName (givenTree,-1);
total = Columns(branchNames)-1;
branchReport = {total,6};
labelString = "";
totalLength = {4,1};
for (h1=0; h1 < total; h1=h1+1)
{
abn = branchNames[h1];
ExecuteCommands("GetInformation (aRateMx, givenTree."+abn+");");
synSubs = (horOnes*(aRateMx$synM))*vertOnes;
nsynSubs = (horOnes*(aRateMx$nonSynM))*vertOnes;
synSubs = synSubs[0]/3;
nsynSubs = nsynSubs[0]/3;
synSubsAVL[abn] = synSubs;
nsSubsAVL [abn] = nsynSubs;
dSAVL[abn] = synSubs *sSites;
dNAVL[abn] = nsynSubs*nsSites;
ExecuteCommands("localOmega = givenTree."+abn+".nonSynRate/givenTree."+abn+".synRate;");
branchReport[h1][0] = synSubs;
branchReport[h1][1] = nsynSubs;
branchReport[h1][2] = dSAVL[abn];
branchReport[h1][3] = dNAVL[abn];
branchReport[h1][4] = dNAVL[abn]/dSAVL[abn];
branchReport[h1][5] = localOmega;
for (h2 = 0; h2<4; h2=h2+1)
{
totalLength[h2] = totalLength[h2] + branchReport[h1][h2];
}
labelString = labelString+";"+abn;
}
fprintf (stdout, "\nTotal tree lengths:",
"\n\tE[syn] : ", totalLength[0],
"\n\tE[non-syn] : ", totalLength[1],
"\n\tdS : ", totalLength[2],
"\n\tdN : ", totalLength[3], "\n");
columnHeaders = {{"E[syn]","E[nonsyn]","dS","dN","dN/dS","omega=beta/alpha",labelString}};
OpenWindow (CHARTWINDOW,{{"Branch Report"}
{"columnHeaders"}
{"branchReport"}
{"Scatterplot"}
{"dS"}
{"dN"}
{"dS"}
{""}
{"dN"}
{"0"}
{""}
{"-1;-1"}
{"10;1.309;0.785398"}
{"Times:12:0;Times:10:0;Times:12:2"}
{"0;0;16777215;11776947;0;0;6579300;11842740;13158600;14474460;0;3947580;16777215;15670812;6845928;16771158;2984993;9199669;7018159;1460610;16748822;11184810;14173291"}
{"16,0,0"}
},
"(SCREEN_WIDTH-50)/2;(SCREEN_HEIGHT-50)/2;30+(SCREEN_WIDTH-30)/2;45");
treeAVL = givenTree ^ 0;
synTreeString = PostOrderAVL2StringDistances (treeAVL, synSubsAVL);
nonSynTreeString = PostOrderAVL2StringDistances (treeAVL, nsSubsAVL);
dSTreeString = PostOrderAVL2StringDistances (treeAVL, dSAVL);
dNTreeString = PostOrderAVL2StringDistances (treeAVL, dNAVL);
fprintf (stdout, "\n\nE[Syn subs/nucleotide site] tree: \n\t", synTreeString, "\n");
fprintf (stdout, "\nE[Non-syn subs/nucleotide site] tree: \n\t", nonSynTreeString, "\n");
fprintf (stdout, "\ndS tree: \n\t", dSTreeString, "\n");
fprintf (stdout, "\ndN tree: \n\t", dNTreeString, "\n");
UseModel (USE_NO_MODEL);
Tree synSubsTree = synTreeString;
Tree nonsynSubsTree = nonSynTreeString;
Tree dSTree = dSTreeString;
Tree dNTree = dNTreeString;
mxTreeSpec = {5,1};
mxTreeSpec [0] = "nonsynSubsTree";
mxTreeSpec [3] = "";
mxTreeSpec [4] = "Inferred_Tree."+nodeName;
mxTreeSpec [1] = "8211";
mxTreeSpec [2] = "";
OpenWindow (TREEWINDOW, mxTreeSpec,"(SCREEN_WIDTH-50)/4;(SCREEN_HEIGHT-50)/4;10;40");
mxTreeSpec [0] = "synSubsTree";
OpenWindow (TREEWINDOW, mxTreeSpec,"(SCREEN_WIDTH-50)/4;(SCREEN_HEIGHT-50)/4;30+(SCREEN_WIDTH-30)/4;40");
mxTreeSpec [0] = "dSTree";
OpenWindow (TREEWINDOW, mxTreeSpec,"(SCREEN_WIDTH-50)/4;(SCREEN_HEIGHT-50)/4;10;45+(SCREEN_HEIGHT-50)/4");
mxTreeSpec [0] = "dNTree";
OpenWindow (TREEWINDOW, mxTreeSpec,"(SCREEN_WIDTH-50)/4;(SCREEN_HEIGHT-50)/4;30+(SCREEN_WIDTH-30)/4;45+(SCREEN_HEIGHT-50)/4");
/* start: constrain global parameters for pairwise estimates */
GetString (likelihoodInfo, theLnLik, -1);
globalVariableList = likelihoodInfo["Global Independent"];
globalVariableCount = Columns (globalVariableList);
if (globalVariableCount)
{
stashed_GV = {globalVariableCount,1};
for (vc = 0; vc < globalVariableCount; vc = vc + 1)
{
ExecuteCommands (globalVariableList[vc]+":="+globalVariableList[vc]+"__;");
}
}
/* end: constrain global parameters for pairwise estimates */
taxa = {{0,0}};
theStencil = ComputeScalingStencils (0);
tc = filteredData.species*(filteredData.species-1)/2;
fprintf (stdout, "\nEstimating pairwise dN/dS: total of ", tc, " analyses\n");
branchReport2 = {tc,4};
labelString = "";
tc = 0;
for (tax1 = 0; tax1 < filteredData.species; tax1=tax1+1)
{
for (tax2 = tax1+1; tax2 < filteredData.species; tax2=tax2+1)
{
GetString (tx1, filteredData, tax1);
GetString (tx2, filteredData, tax2);
BRANCH_LENGTH_STENCIL = theStencil["Syn"];
dS = BranchLength(givenTree, tx1+";"+tx2);
BRANCH_LENGTH_STENCIL = theStencil["NonSyn"];
dN = BranchLength(givenTree, tx1+";"+tx2);
BRANCH_LENGTH_STENCIL = 0;
labelString = labelString + ";" + tx1 + "," + tx2;
branchReport2 [tc][0] = dS*sSites;
branchReport2 [tc][1] = dN*nsSites;
DataSetFilter twoSeqFilter = CreateFilter (ds,3,"",speciesIndex == tax1 || speciesIndex == tax2,GeneticCodeExclusions);
twoTreeString = "("+tx1+","+tx2+")";
UseModel (MG94customModel);
Tree t2 = twoTreeString;
LikelihoodFunction twoLnLik = (twoSeqFilter, t2);
Optimize (res2, twoLnLik);
v1 = ReturnVectorsOfCodonLengths (theStencil,"t2");
branchReport2 [tc][2] = (scaledVectors["Syn"])[0]*sSites;
branchReport2 [tc][3] = (scaledVectors["NonSyn"])[0]*nsSites;
tc = tc + 1;
}
}
VERBOSITY_LEVEL = 0;
columnHeaders = {{"Tree based dS","Tree based dN","Pairwise dS","Pairwise dN",labelString}};
OpenWindow (CHARTWINDOW,{{"Pairwise Report"}
{"columnHeaders"}
{"branchReport2"}
{"Scatterplot"}
{"Tree based dN"}
{"Pairwise dN"}
{"Tree based dN"}
{""}
{"Pairwise dN"}
{"0"}
{""}
{"-1;-1"}
{"10;1.309;0.785398"}
{"Times:12:0;Times:10:0;Times:12:2"}
{"0;0;16777215;11776947;0;0;6579300;11842740;13158600;14474460;0;3947580;16777215;15670812;6845928;16771158;2984993;9199669;7018159;1460610;16748822;11184810;14173291"}
{"16,0,0"}
},
"(SCREEN_WIDTH-50)/2;(SCREEN_HEIGHT-50)/2;30+(SCREEN_WIDTH-30)/2;45+(SCREEN_HEIGHT-50)/2");
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment