Last active
May 23, 2020 05:47
-
-
Save Krailon/3ae4f43460292910b3aa to your computer and use it in GitHub Desktop.
Modified version of Lee's script to convert multiline FASTA to single-line. A "coverage threshold" option has been added to allow discarding of low-coverage sequences.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Created by: Lee Bergstrand | |
# Modified by: Matt McInnes | |
# Descript: Converts multiline FASTAs to single line FASTAs | |
# | |
# Usage: FastaMLtoSL.py <sequences.faa> | |
# Example: FastaMLtoSL.py mySeqs.faa | |
#---------------------------------------------------------------------------------------- | |
#=========================================================================================================== | |
#Imports: | |
import sys | |
import re | |
#=========================================================================================================== | |
# Functions: | |
# 1: Checks if in proper number of arguments are passed gives instructions on proper use. | |
def argsCheck(numArgs): | |
if len(sys.argv) != numArgs: | |
print("Converts multiline FASTAs to single line FASTAs") | |
print("By Lee Bergstrand\n") | |
print("Usage: " + sys.argv[0] + " <sequences.fasta> <coverage threshold>") | |
print("Examples: " + sys.argv[0] + " mySeqs.fasta 150") | |
exit(1) # Aborts program. (exit(1) indicates that an error occurred) | |
#=========================================================================================================== | |
# Main program code: | |
# House keeping... | |
argsCheck(3) # Checks if the number of arguments are correct. | |
try: | |
# Stores file one for input checking. | |
inFile = sys.argv[1] | |
outFile = inFile + ".out" | |
coverage_threshold = float(sys.argv[2]) | |
print(">> Using coverage threshold of %f" % coverage_threshold) | |
# Reads sequence file list and stores it as a string object. Safely closes file: | |
print(">> Opening FASTA file...") | |
with open(inFile, "r") as fasta_file: | |
fasta_data = fasta_file.read() | |
sequence_blocks = re.findall(">[^>]+", fasta_data) | |
except IOError: | |
print("Error: failed to open " + inFile) | |
exit(1) | |
print(">> Converting FASTA file from multiline to single line and writing to file.") | |
# Conversts multiline fasta to single line. Writes new fasta to file. | |
try: | |
with open(outFile, "w") as newFasta: | |
for block in sequence_blocks: | |
try: | |
header, sequence = block.split("\n", 1) # Split each fasta into header and sequence. | |
except ValueError: | |
print("Error: failed to split header and sequence") | |
header = header + "\n" # Replace ">" lost in ">" split, Replace "\n" lost in split directly above. | |
sequence = sequence.replace("\n", "") + "\n" # Replace newlines in sequence, remember to add one to the end. | |
match = re.search("node_(\d+)_length_\d+_cov_([\d\.]+)_id_(\d+)", header, re.IGNORECASE) | |
if not match: | |
print("Warning: failed to extract header values from sequence:\n%s" % sequence) | |
continue | |
sequence_node = int(match.groups()[0]) | |
sequence_coverage = float(match.groups()[1]) | |
sequence_id = int(match.groups()[2]) | |
if sequence_coverage < coverage_threshold: | |
print(">> Skipping sequence {NODE: %d, id: %d} with coverage %d" % (sequence_node, sequence_id, sequence_coverage)) | |
continue | |
newFasta.write(header + sequence) | |
except IOError: | |
print("Error: failed to open " + inFile) | |
exit(1) | |
newFasta.close() | |
print(">> Done!") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment