Skip to content

Instantly share code, notes, and snippets.

@mtholder
Created August 26, 2012 04:25
Show Gist options
  • Save mtholder/3474043 to your computer and use it in GitHub Desktop.
Save mtholder/3474043 to your computer and use it in GitHub Desktop.
Reads a phylogenetic tree or data file and prints out the taxa labels one per line
// Copyright (C) 2012 Mark T. Holder
//
// Based on example/splitsinfile/splitsinfile.cpp in NEXUS class library
//
// After NCL has been installed at ${NCL_PREFIX}
//
// g++ newick2taxalist.cpp -o newick2taxalist -I "${NCL_PREFIX}/include" -lncl -L "${NCL_PREFIX}/lib/ncl"
//
// Takes a newick file and prints out the taxa labels, one per line.
//
// This script is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This script is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with NCL; if not, write to the Free Software Foundation, Inc.,
// 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>

#include "ncl/ncl.h"
#include "ncl/nxsblock.h"
#include "ncl/nxspublicblocks.h"
#include "ncl/nxsmultiformat.h"
using namespace std;
// Parser strictness level; overwritten by the -s<integer> command-line flag in
// main. processFilepath relaxes the reader when it is below 2 and changes the
// warning-to-error threshold whenever it differs from 2.
long gStrictLevel = 2;
// When positive, processFilepath passes this value to SetWriteInterleaveLen on
// the characters and data block templates. No code visible in this file
// assigns it, so it stays at -1.
long gInterleaveLen = -1;
// Set to true by the -v command-line flag; not read by any code visible in
// this file.
bool gVerbose = false;
// Forward declaration; the definition follows immediately below.
void processContent(PublicNexusReader & nexusReader, ostream *out);
////////////////////////////////////////////////////////////////////////////////
// Writes the taxon labels held by a reader that has already read a file.
//
// Walks the reader's used blocks in order; every block whose ID is "TAXA"
// has all of its labels written to `out`, one label per line. Nothing is
// written when `out` is NULL.
//
// The caller is responsible for calling DeleteBlocksFromFactories() to clean
// up (if the reader uses the factory API).
////////////////////////////////////////////////////////////////////////////////
void processContent(PublicNexusReader & nexusReader, ostream *out)
{
    if (out == 0L)
        return;
    ostream & sink = *out;
    BlockReaderList usedBlocks = nexusReader.GetUsedBlocksInOrder();
    BlockReaderList::const_iterator blockIt = usedBlocks.begin();
    while (blockIt != usedBlocks.end())
    {
        NxsBlock * block = *blockIt;
        ++blockIt;
        // Only TAXA blocks carry the labels we want to print.
        if (!(block->GetID() == "TAXA"))
            continue;
        NxsTaxaBlockAPI * taxa = (NxsTaxaBlockAPI *)block;
        const std::vector<std::string> labels = taxa->GetAllLabels();
        for (std::vector<std::string>::size_type i = 0; i < labels.size(); ++i)
            sink << labels[i] << '\n';
    }
}
////////////////////////////////////////////////////////////////////////////////
// Creates a MultiFormatReader, configures it from the global settings
// gStrictLevel and gInterleaveLen, and tries to read the file `filename`. If
// the read succeeds, then processContent will be called.
//
// A NxsException is reported on cerr and ends the program with exit code 2.
// Any other exception is rethrown to the caller after the reader's
// factory-created blocks have been deleted.
////////////////////////////////////////////////////////////////////////////////
void processFilepath(
    const char * filename, // name of the file to be read
    ostream *out, // output stream to use (NULL for no output). Note that cerr is used to report errors.
    MultiFormatReader::DataFormatType fmt) // enum indicating the file format to expect.
{
    assert(filename);
    try
    {
        MultiFormatReader nexusReader(-1, NxsReader::WARNINGS_TO_STDERR);
        if (gStrictLevel != 2)
            nexusReader.SetWarningToErrorThreshold(static_cast<int>(NxsReader::FATAL_WARNING) + 1 - static_cast<int>(gStrictLevel));
        nexusReader.SetWarningOutputLevel(NxsReader::SUPPRESS_WARNINGS_LEVEL);

        NxsCharactersBlock * charsB = nexusReader.GetCharactersBlockTemplate();
        NxsDataBlock * dataB = nexusReader.GetDataBlockTemplate();
        // Validate both templates before they are dereferenced for the first
        // time (the check used to come after the first use, and dataB was
        // never checked at all).
        assert(charsB);
        assert(dataB);
        charsB->SetAllowAugmentingOfSequenceSymbols(true);
        dataB->SetAllowAugmentingOfSequenceSymbols(true);
        if (gInterleaveLen > 0)
        {
            charsB->SetWriteInterleaveLen(gInterleaveLen);
            dataB->SetWriteInterleaveLen(gInterleaveLen);
        }

        NxsTreesBlock * treesB = nexusReader.GetTreesBlockTemplate();
        assert(treesB);
        if (gStrictLevel < 2)
        {
            // Relaxed parsing: allow implicit names in trees and tolerate an
            // end-of-file inside an unknown block.
            treesB->SetAllowImplicitNames(true);
            NxsStoreTokensBlockReader *storerB = nexusReader.GetUnknownBlockTemplate();
            assert(storerB);
            storerB->SetTolerateEOFInBlock(true);
        }

        try
        {
            nexusReader.ReadFilepath(filename, fmt);
            processContent(nexusReader, out);
        }
        catch(...)
        {
            // Release the blocks created by the factories before the
            // exception leaves this function.
            nexusReader.DeleteBlocksFromFactories();
            throw;
        }
        nexusReader.DeleteBlocksFromFactories();
    }
    catch (const NxsException &x)
    {
        cerr << "Error:\n " << x.msg << endl;
        // A negative line number means no position is available to report.
        if (x.line >=0)
            cerr << "at line " << x.line << ", column (approximately) " << x.col << " (and file position "<< x.pos << ")" << endl;
        exit(2);
    }
}
void readFilepathAsNEXUS(const char *filename, MultiFormatReader::DataFormatType fmt)
{
cerr << "[Reading " << filename << " ]" << endl;
try {
ostream * outStream = 0L;
outStream = &cout;
processFilepath(filename, outStream, fmt);
}
catch (...)
{
cerr << "Parsing of " << filename << " failed (with an exception)" << endl;
exit(1);
}
}
// Treats `masterFilepath` as a list of file paths, one per line, and passes
// each listed path to readFilepathAsNEXUS together with `fmt`.
//
// Empty lines and lines whose first character is '#' are skipped. If the list
// file cannot be opened, a message is written to cerr and the program ends
// with exit code 3.
void readFilesListedIsFile(const char *masterFilepath, MultiFormatReader::DataFormatType fmt)
{
    ifstream masterStream(masterFilepath, ios::binary);
    // A failed open sets failbit, not badbit, so testing bad() would let a
    // missing or unreadable list file pass silently.
    if (!masterStream.is_open())
    {
        cerr << "Could not open " << masterFilepath << "." << endl;
        exit(3);
    }
    // std::string imposes no length limit on a line; a fixed-size buffer would
    // truncate long paths and stop the loop early.
    std::string line;
    while (std::getline(masterStream, line))
    {
        // The stream is opened in binary mode, so a CRLF line ending leaves a
        // trailing '\r' that is not part of the path.
        if (!line.empty() && line[line.size() - 1] == '\r')
            line.erase(line.size() - 1);
        if (!line.empty() && line[0] != '#')
            readFilepathAsNEXUS(line.c_str(), fmt);
    }
}
// Writes the usage message to `out`: the fixed description of the command-line
// flags followed by every format name returned by
// MultiFormatReader::getFormatNames(), one per line.
void printHelp(ostream & out)
{
    // Adjacent string literals are concatenated by the compiler, so this is a
    // single block of text.
    static const char * const helpText =
        "newick2taxalist takes a file and prints out the taxa labels, one per line.\n"
        "\nThe most common usage is simply:\n newick2taxalist <path to NEXUS file>\n"
        "\nCommand-line flags:\n\n"
        " -h on the command line shows this help message\n\n"
        " -v verbose output\n\n"
        " -l<path> reads a file and treats each line of the file as a path to NEXUS file\n\n"
        " -s<non-negative integer> controls the NEXUS strictness level.\n"
        " the default level is equivalent to -s2 invoking the program with \n"
        " -s3 or a higher number will convert some warnings into fatal errors.\n"
        " Running with -s1 will cause the parser to accept dangerous constructs,\n"
        " and running with -s0 will cause the parser make every attempt to finish\n"
        " parsing the file (warning about very serious errors).\n\n"
        " Note that when -s0 strictness level is used, and the parser fails to\n"
        " finish, it will often be the result of an earlier error than the \n"
        " error that is reported in the last message.\n"
        " -f<format> specifies the input file format expected:\n"
        " -fnexus NEXUS (this is also the default)\n"
        " -faafasta Amino acid data in fasta\n"
        " -fdnafasta DNA data in fasta\n"
        " -frnafasta RNA data in fasta\n"
        " The complete list of format names that can follow the -f flag is:\n";
    out << helpText;

    const std::vector<std::string> formatNames = MultiFormatReader::getFormatNames();
    for (std::vector<std::string>::size_type i = 0; i < formatNames.size(); ++i)
        out << " " << formatNames[i] << "\n";
}
// Program entry point. Arguments are handled strictly left to right:
//   -h...        print the help text to cout
//   -v           set gVerbose
//   -s<integer>  set gStrictLevel
//   -f<format>   set the format used for the files that FOLLOW on the command line
//   -l<path>     process every file listed in <path>
//   anything else is treated as the path of a file to process.
// Because files are processed as they are encountered, -s and -f affect only
// the files named after them.
//
// Returns 0 on success (including when only help was requested), 1 when no
// input file was given, and 2 for a malformed -s, -f or -l flag. The functions
// that read files may end the program themselves with other exit codes.
int main(int argc, char *argv[])
{
    MultiFormatReader::DataFormatType f(MultiFormatReader::NEXUS_FORMAT);
    bool readfile = false;
    bool helpShown = false;
    for (int i = 1; i < argc; ++i)
    {
        const char * filepath = argv[i];
        const size_t slen = strlen(filepath);
        // A flag is '-' followed by at least one character; a lone "-" is
        // treated as a file path.
        const char flag = (slen > 1 && filepath[0] == '-') ? filepath[1] : '\0';
        if (flag == 'h')
        {
            printHelp(cout);
            helpShown = true;
        }
        else if (flag == 'v' && slen == 2)
            gVerbose = true;
        else if (flag == 's')
        {
            if ((slen == 2) || (!NxsString::to_long(filepath + 2, &gStrictLevel)))
            {
                cerr << "Expecting an integer after -s\n" << endl;
                printHelp(cerr);
                return 2;
            }
        }
        else if (flag == 'f')
        {
            f = MultiFormatReader::UNSUPPORTED_FORMAT;
            if (slen > 2)
            {
                std::string fmtName(filepath + 2, slen - 2);
                f = MultiFormatReader::formatNameToCode(fmtName);
            }
            if (f == MultiFormatReader::UNSUPPORTED_FORMAT)
            {
                cerr << "Expecting a format after -f\n" << endl;
                printHelp(cerr);
                return 2;
            }
        }
        else if (flag == 'l')
        {
            // A bare "-l" has no list file to read; report it instead of
            // trying to parse a file literally named "-l".
            if (slen == 2)
            {
                cerr << "Expecting a path after -l\n" << endl;
                printHelp(cerr);
                return 2;
            }
            readfile = true;
            readFilesListedIsFile(filepath + 2, f);
        }
        else
        {
            readfile = true;
            readFilepathAsNEXUS(filepath, f);
        }
    }
    if (!readfile)
    {
        // Asking only for help is a successful run, not a usage error.
        if (helpShown)
            return 0;
        cerr << "Expecting the path to NEXUS file as the only command line argument!\n" << endl;
        printHelp(cerr);
        return 1;
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment