PhylogeneticTree with FastOMA
Updated code from the F1000 PhylogeneticTree paper on GitHub, adapted for FastOMA.
| #!/usr/bin/env python | |
| import logging | |
| import sys | |
| import argparse | |
| from Bio import AlignIO | |
| from Bio.Align import MultipleSeqAlignment | |
| from Bio.Seq import Seq | |
| from Bio.SeqRecord import SeqRecord | |
| from collections import defaultdict | |
| # slight changes of the code from https://github.com/DessimozLab/f1000_PhylogeneticTree/blob/master/concat_alignments.py | |
| logger = logging.getLogger("concat") | |
def load_alignments(alignmentfiles, format):
    """Parse every multiple sequence alignment found in the given files.

    :param alignmentfiles: iterable of paths to alignment files
    :param format: any alignment format understood by Bio.AlignIO
    :return: list of MultipleSeqAlignment objects, in file order
    :raises ValueError: re-raised if a file cannot be parsed in ``format``
    """
    alignments = []
    for path in alignmentfiles:
        try:
            for aln in AlignIO.parse(path, format=format):
                logger.debug("loaded alignment of length {} from {}".format(len(aln), path))
                alignments.append(aln)
        except ValueError as exc:
            # Log which file broke before propagating the parse error.
            logger.error("Cannot parse input file {}: {}".format(path, exc))
            raise
    logger.info("Successfully loaded {} alignments from {} input files"
                .format(len(alignments), len(alignmentfiles)))
    return alignments
def concatenate(alignments):
    """Concatenate several alignments into one super-alignment.

    Sequences are matched across alignments by record id. Any id that is
    absent from a particular alignment is padded with 'X' characters over
    that alignment's full length, so every output row has the same length.

    :param alignments: sequence of MultipleSeqAlignment objects
    :return: a single MultipleSeqAlignment over the union of all ids
    """
    # Union of sequence ids over every input alignment.
    all_labels = {rec.id for aln in alignments for rec in aln}
    logger.debug("extracted {} different labels in all alignments: {}"
                 .format(len(all_labels), all_labels))
    # Per-label list of per-alignment substrings, joined once at the end.
    concat_buf = defaultdict(list)
    for aln in alignments:
        length = aln.get_alignment_length()
        present = {rec.id for rec in aln}
        missing = all_labels - present
        logger.debug("alignment of length {} with {} sequences, {} missing ({})"
                     .format(length, len(present), len(missing), missing))
        # Pad every label missing from this alignment with unknown residues.
        filler = 'X' * length  # UnknownSeq(length, character='X')
        for label in missing:
            concat_buf[label].append(str(filler))
        # Append the real sequences for the labels that are present.
        for rec in aln:
            concat_buf[rec.id].append(str(rec.seq))
    # ''.join is the efficient way to stitch the substrings together.
    msa = MultipleSeqAlignment(SeqRecord(Seq(''.join(chunks)), id=label)
                               for (label, chunks) in concat_buf.items())
    logger.info("concatenated MSA of {} taxa and total length {} created"
                .format(len(msa), len(msa[0])))
    return msa
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Concatenate alignments",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-f', '--format-in', default='fasta',
                        help="input format of the alignments. Any format that is understood"
                             "by Biopython's AlignIO module is possible.")
    parser.add_argument('-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
                        help="Path to the output file where the concatenated multiple "
                             "sequence alignment will be written")
    parser.add_argument('-u', '--format-output', default='phylip-relaxed',
                        help="output format of the concatenated multiple sequence alignment")
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help="Produce some output and status reports")
    parser.add_argument('-d', '--debug', action="store_true", default=False,
                        help="Be more verbose for debugging purposes")
    parser.add_argument('alignment', nargs='+', type=str,
                        help="Path to the alignment files. Use shell expansion to pass many files "
                             "in a simple way, e.g. \"/path/to/folder/*.fa\".")
    conf = parser.parse_args()

    # The most verbose flag wins: -d beats -v beats the WARNING default.
    if conf.debug:
        level = logging.DEBUG
    elif conf.verbose:
        level = logging.INFO
    else:
        level = logging.WARNING
    logging.basicConfig(level=level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    logger.debug("Concatenate alignments: arguments: {}".format(conf))

    # Load all input alignments, concatenate, and write the result out.
    msa = concatenate(load_alignments(conf.alignment, conf.format_in.lower()))
    AlignIO.write(msa, conf.output, conf.format_output.lower())
| #!/usr/bin/env python | |
| # slight changes of the code from https://github.com/DessimozLab/f1000_PhylogeneticTree/blob/master/concat_alignments.py | |
| import math | |
| import multiprocessing | |
| import argparse | |
| import os | |
| import Bio.SeqIO | |
| import re | |
| import logging | |
| logger = logging.getLogger('filter-group') | |
# Pattern extracting a species name from a bracketed header such as
# ">xxx [Homo sapiens] ...". Written as a raw string: the original
# non-raw "\[" is an invalid escape sequence (SyntaxWarning on modern
# Python). NOTE(review): currently unused -- headers are parsed by
# get_species_from_header instead; kept for reference.
RE_SPECIES = re.compile(r"\[(?P<species>[^]]*)\].*")
def get_species_from_header(header):
    """Return the species identifier encoded in a FastOMA fasta header.

    FastOMA headers have the shape ``<prot_id>||<species>||...``; the
    species is the second ``||``-separated field.

    :param header: full fasta description line of a record
    :return: the species field as a string
    :raises IndexError: if the header contains no ``||`` separator
    """
    # Removed the dead, commented-out RE_SPECIES-based parsing that this
    # split replaced; behavior is unchanged.
    return header.split("||")[1]
def convert_fasta_file(infile, outfile, min_nr_species):
    """Rewrite a fasta file with species-name ids, one sequence per species.

    Every record's id is replaced by the species extracted from its header;
    later records of an already-seen species are dropped. The filtered
    records are written to *outfile* only if at least *min_nr_species*
    distinct species remain; otherwise nothing is written.

    :param infile: path of the fasta file to read
    :param outfile: path of the fasta file to (over)write
    :param min_nr_species: minimum distinct species required to keep the file
    :return: number of sequences written (0 if the file was skipped)
    """
    species_seen = set([])
    kept = []  # [convert_id_to_species_name(z) for z in Bio.SeqIO.parse(infile, 'fasta')]
    for rec in Bio.SeqIO.parse(infile, 'fasta'):
        rec.id = get_species_from_header(rec.description)
        if rec.id in species_seen:
            continue  # silently keep only the first sequence per species
        species_seen.add(rec.id)
        kept.append(rec)
    if len(kept) < min_nr_species:
        logger.debug("skipping \"{}\" with {} sequences".format(infile, len(kept)))
        return 0
    cnts = Bio.SeqIO.write(kept, outfile, 'fasta')
    logger.debug("\"{}\" contains {} sequences. IDs have been converted and file rewritten to {}"
                 .format(infile, cnts, outfile))
    return cnts
def get_matching_files(input_dir, pattern):
    """Yield the os.DirEntry of each entry in *input_dir* whose name ends with *pattern*.

    The scan is non-recursive; *pattern* is a plain filename suffix
    (e.g. ``'.fa'``), not a glob.
    """
    with os.scandir(input_dir) as entries:
        yield from (entry for entry in entries if entry.name.endswith(pattern))
def convert_files(input_dir, output_dir, min_nr_species, pattern='.fa', nr_processes=None):
    """Filter and convert all matching fasta files of *input_dir* in parallel.

    Each file is processed with :func:`convert_fasta_file` in a worker pool
    and written to *output_dir* (created if necessary) under its original name.

    :param input_dir: directory containing the input fasta files
    :param output_dir: target directory for converted files
    :param min_nr_species: minimum distinct species a file must cover to be kept
    :param pattern: filename suffix of files to consider (default '.fa')
    :param nr_processes: worker count; defaults to the number of available cores
    """
    os.makedirs(output_dir, exist_ok=True)
    args_iter = ((x.path, os.path.join(output_dir, x.name), min_nr_species)
                 for x in get_matching_files(input_dir, pattern))
    with multiprocessing.Pool(processes=nr_processes) as mp:
        logger.info("converting files in {} with {} processes in parallel"
                    .format(input_dir, nr_processes if nr_processes else os.cpu_count()))
        result = mp.starmap(convert_fasta_file, args_iter)
    converted = [x for x in result if x > 0]
    # BUGFIX: the summary previously divided by len(converted) and by
    # len(converted)-1 unconditionally, raising ZeroDivisionError when no
    # file (mean) or exactly one file (sample sd) passed the filter.
    if not converted:
        logger.info("0 (of {}) files passed the min-species filter; nothing converted"
                    .format(len(result)))
        return
    mean = sum(converted) / len(converted)
    if len(converted) > 1:
        # Sample standard deviation via the sum-of-squares identity.
        sd = math.sqrt((sum(x**2 for x in converted) - (sum(converted)**2)/len(converted))
                       / (len(converted) - 1))
    else:
        sd = 0.0  # sd is undefined for a single sample; report 0
    logger.info("{} (of {}) files filtered and converted. Converted files "
                "contain {:.2f} species on average (sd {:.4f})"
                .format(len(converted), len(result), mean, sd))
class InvalidHeaderFormat(Exception):
    """Signals a fasta header that cannot be parsed for a species name.

    NOTE(review): currently defined but never raised; get_species_from_header
    lets an IndexError escape instead -- confirm intended usage.
    """
    pass
class SeveralSequencePerSpeciesException(Exception):
    """Signals more than one sequence for the same species in a marker gene.

    NOTE(review): currently defined but never raised; convert_fasta_file
    silently drops duplicate species instead -- confirm intended usage.
    """
    pass
if __name__ == "__main__":
    parser = argparse.ArgumentParser("Filter marker genes based on how complete they cover the "
                                     "species set")
    parser.add_argument('-n', '--min-nr-species', type=int, required=True,
                        help="Specify the minimum number of species that should be "
                             "covered by each marker gene.")
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to a directory containing the input files to be filtered')
    parser.add_argument('-o', '--output', required=True,
                        help="Path to the output directory where the filtered fasta files will "
                             "be stored. This directory does not need to exist. If it does, existing "
                             "files will not be removed, but might be overwritten.")
    parser.add_argument('-e', '--ext', type=str, default=".fa",
                        help="Extension of files to be considered for conversion. (default %(default)s)")
    parser.add_argument('-#', '--threads', type=int,
                        help="Nr of threads to use to filter input files in parallel. "
                             "(defaults to the number of available cores)")
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help="Produce some output and status reports")
    parser.add_argument('-d', '--debug', action="store_true", default=False,
                        help="Be more verbose for debugging purposes")
    conf = parser.parse_args()

    # -d beats -v beats the WARNING default.
    level = logging.WARNING
    if conf.verbose:
        level = logging.INFO
    if conf.debug:
        level = logging.DEBUG
    logging.basicConfig(level=level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # BUGFIX: message previously said "Concatenate alignments: arguments:",
    # copy-pasted from the concatenation script; this is the filtering script.
    logger.debug("Filter groups: arguments: {}".format(conf))
    convert_files(conf.input, conf.output, conf.min_nr_species, pattern=conf.ext, nr_processes=conf.threads)