-
-
Save jrjhealey/dda98e024a53e2f2c2c32b15ceb6bfd3 to your computer and use it in GitHub Desktop.
# Extract fasta files by their descriptors stored in a separate file.
# Requires biopython
from Bio import SeqIO | |
import sys | |
import argparse | |
def getKeys(args):
    """Turn the input key file into a list of keys, one per line.

    ``args`` may be either the path of the key file itself, or an object
    that exposes that path as its ``keys`` attribute (such as the parsed
    argparse namespace).  Accepting both keeps the function usable whether
    the caller hands over the namespace or just ``args.keys``.

    The trailing newline and any leading '>' characters are removed from
    every line.  The whole file is held in memory, so very large key files
    may be memory intensive.

    Raises IOError (OSError) if the file cannot be opened; callers use this
    to detect that a literal key string was given instead of a file path.
    """
    # A plain path string has no ``keys`` attribute, so it is used as-is.
    key_path = getattr(args, "keys", args)
    with open(key_path, "r") as kfh:
        return [line.rstrip('\n').lstrip('>') for line in kfh]
def main():
    """Fetch sequences from a multifasta by their header IDs.

    Takes a single key string, or a text file of keys (one per line), and
    retrieves the matching records and their sequences from the provided
    multifasta.  Matching is exact against each record's ID.  With
    -i/--invert, every record NOT named by the keys is retrieved instead.

    Matching records are always printed to the screen; when -o/--outfile is
    given they are additionally written to that file.

    Exits with status 1 if no usable key could be determined.  Argument
    errors and -h/--help are handled by argparse itself (which exits with
    its own usage message and status).
    """
    # Parse arguments from the commandline.  argparse reports bad options and
    # handles --help by raising SystemExit, so it must not be wrapped in a
    # catch-all handler (that would turn --help into an error).
    parser = argparse.ArgumentParser(description='Retrieve one or more fastas from a given multifasta.')
    parser.add_argument(
        '-f',
        '--fasta',
        action='store',
        required=True,
        help='The multifasta to search.')
    parser.add_argument(
        '-k',
        '--keys',
        action='store',
        required=True,
        help='A string provided directly, or a file of header strings to search the multifasta for. Must be exact. Must be one per line.')
    parser.add_argument(
        '-o',
        '--outfile',
        action='store',
        default=None,
        help='Output file to store the new fasta sequences in. Just prints to screen by default.')
    parser.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='Set whether to print the key list out before the fasta sequences. Useful for debugging.')
    parser.add_argument(
        '-i',
        '--invert',
        action='store_true',
        help='Invert the search, and retrieve all sequences NOT specified in the keyfile.')
    args = parser.parse_args()

    inFile = args.fasta

    # Call getKeys() to create the list of keys from the provided file.
    # getKeys() looks the path up as `.keys` on its argument, so it is given
    # the whole namespace.
    try:
        keys = getKeys(args)
    except IOError:
        # If -k/--keys was provided with a string, not a file path, the IO
        # error is used as the indicator to switch to expecting a string
        # only.  It is wrapped in a list so that membership tests compare
        # whole IDs rather than substrings of the key.
        keys = [args.keys.lstrip('>')]

    # Blank lines in a key file can never match a record ID; drop them.
    keys = [key for key in keys if key]
    if not keys:
        sys.stderr.write("Couldn't determine a key from your provided file or string. Double check your file, or ensure your string is quoted correctly.\n")
        sys.exit(1)

    if args.verbose:
        if args.invert:
            print('Ignoring the following keys, and retreiving everything else from: ' + inFile)
        else:
            print('Fetching the following keys from: ' + inFile)
        for key in keys:
            print(key)

    # A set gives constant-time lookups for every record in the multifasta.
    wanted = set(keys)

    # Open the output file once; writing by filename inside the loop would
    # overwrite the file for every record and keep only the last one.
    outHandle = open(args.outfile, 'w') if args.outfile is not None else None
    try:
        # For each sequence in the multifasta, check whether its ID is among
        # the keys.  A record is emitted when "is a key" differs from
        # "invert requested": keys only by default, non-keys with --invert.
        for seq in SeqIO.parse(inFile, 'fasta'):
            if (seq.id in wanted) != args.invert:
                print(seq.format("fasta"))
                if outHandle is not None:
                    SeqIO.write(seq, outHandle, "fasta")
    finally:
        if outHandle is not None:
            outHandle.close()
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Hi Vasu,
This code is a little old, so try the newer version available here: https://github.com/jrjhealey/bioinfo-tools/blob/master/fastafetcher.py
If you run the code as `python fastafetcher.py --help`, it will tell you exactly what's needed to run it. You will need the biopython package installed for it to work.
Hi,
I did copy the code from https://github.com/jrjhealey/bioinfo-tools/blob/master/fastafetcher.py. I tried running it as suggested in the help file. There are many errors that pop up when I try to run it.
For example:
The command I used : python fastafetcher.py -f OG0000009.fa [-s "|OG0010404_batch_1|"] [-m {exact,partial}]
The error:
usage: fastafetcher.py [-h] -f FASTA [-k KEYFILE] [-s STRING] [-o OUTFILE] [-v] [-i] [-m {exact,partial}]
fastafetcher.py: error: unrecognized arguments: [-s |OG0010404_batch_1|] [-m exact] partial]
Here, OG0000009.fa is the fasta file to search in and "|OG0010404_batch_1|" is my keyword. I need to search in partial mode.
Please let me know if you can help me.
Thanks in advance,
Sanchita
You don't need to use the [ ] - these are a convention to denote which arguments are optional.
You would simply run something like: fastafetcher.py -f /path/to/fasta.file -s "OG0010404_batch_1"
Depending on how the keys in your file are structured, you may need the | characters you have, or you may need to use the `-m partial` or `-m exact` option too.
Hi again,
I realized that just after I wrote here. It works fine now.
Thank you.
Hi,
I am trying to use this code on my multi-fasta file. Can you please elaborate on exactly how to use this script? What should the input be, and in what format? If possible, can you provide an example? I am new to Python.
Thank you so much. appreciate it.