Skip to content

Instantly share code, notes, and snippets.

@wdecoster
Last active November 25, 2016 14:33
Show Gist options
  • Select an option

  • Save wdecoster/5782affc0753c9b89a05306fe942a021 to your computer and use it in GitHub Desktop.

Select an option

Save wdecoster/5782affc0753c9b89a05306fe942a021 to your computer and use it in GitHub Desktop.
import sys, os, glob
from Bio import SeqIO
def getfiles():
    """Collect the candidate input files in the current working directory.

    Looks for files ending in ``.csv`` or ``.tfa`` (non-recursive), prints
    how many were found, and returns their names.

    Returns:
        list[str]: matching file names, ``.csv`` matches first, then
        ``.tfa`` matches. Empty when nothing matches.
    """
    # Both patterns need the "*" wildcard: a bare ".tfa" would only match a
    # file whose entire name is ".tfa".
    files = glob.glob("*.csv") + glob.glob("*.tfa")
    print("Found {} files to filter".format(len(files)))
    return files
def filter(inputfile, filtered_file=None):
    """Write the records of a FASTA file that are longer than 20 residues.

    Args:
        inputfile: path of the FASTA file to read.
        filtered_file: path of the FASTA file to write. When omitted, the
            output goes next to the input as ``<input base>_filtered.fasta``.

    Prints how many records were kept out of the total read.
    """
    if filtered_file is None:
        # The ".fasta" suffix reflects the format written below and keeps the
        # output from matching the "*.csv" / "*.tfa" input patterns on a
        # later directory run.
        base, _ext = os.path.splitext(inputfile)
        filtered_file = "{}_filtered.fasta".format(base)

    countAll = 0
    countGood = 0
    goodproteins = []
    # Parse the file passed in, not sys.argv[1]: in directory mode the
    # command-line argument is the literal word "dir", not a file path.
    for record in SeqIO.parse(inputfile, 'fasta'):
        countAll += 1  # every record read
        if len(record.seq) > 20:
            countGood += 1  # records passing the length threshold
            goodproteins.append(record)
    SeqIO.write(goodproteins, filtered_file, "fasta")
    print("Filtering retained {} out of {} sequences in this file".format(countGood, countAll))
def main():
    """Run the filter on the input selected by the first command-line argument.

    ``sys.argv[1]`` must be either the literal string ``dir`` (process every
    file returned by ``getfiles()``) or the path of an existing file.

    Raises:
        SystemExit: when no argument is given, or when the argument is
            neither ``dir`` nor an existing file.
    """
    # Guard against a missing argument so the user gets a message instead of
    # an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit("Invalid input! Pass 'dir' or the path to a FASTA file.")

    target = sys.argv[1]
    # Alternatively, could check whether the path is a file or a directory
    # and act accordingly.
    if target == 'dir':
        files = getfiles()
    elif os.path.isfile(target):
        files = [target]
    else:
        sys.exit("Invalid input!")

    for path in files:
        filter(path)
# Only run when executed as a script, so importing this module has no
# side effects.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment