Created
October 24, 2016 20:35
-
-
Save mortonjt/620f8106af3d381cf33676f5400544d8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Filter sequences from a biom table based on a fasta file | |
used for AG bloom filtering | |
""" | |
# amnonscript | |
__version__ = "1.2" | |
import biom | |
import argparse | |
import sys | |
def iterfastaseqs(filename): | |
""" | |
iterate a fasta file and return header,sequence | |
input: | |
filename - the fasta file name | |
output: | |
seq - the sequence | |
header - the header | |
""" | |
fl=open(filename,"rU") | |
cseq='' | |
chead='' | |
for cline in fl: | |
if cline[0]=='>': | |
if chead: | |
yield(cseq,chead) | |
cseq='' | |
chead=cline[1:].rstrip() | |
else: | |
cseq+=cline.strip() | |
if cseq: | |
yield(cseq,chead) | |
fl.close() | |
def filterbiombyfasta(tablefilename,outfilename,fastafilename,ignoretablelen=False): | |
print('loading biom table %s' % tablefilename) | |
table=biom.load_table(tablefilename) | |
print('table has %d sequences.' % table.shape[0]) | |
print('loading fasta file %s' % fastafilename) | |
seqset=set() | |
numtot=0 | |
# get the read len | |
if not ignoretablelen: | |
tseqs=table.ids('observation') | |
seqlen=len(tseqs[0]) | |
for cseq in tseqs: | |
if seqlen!=len(cseq): | |
ignoretablelen=True | |
print('Sequence length not uniform in table! bad sequence is %s' % cseq) | |
break | |
for cseq,chead in iterfastaseqs(fastafilename): | |
numtot+=1 | |
if not ignoretablelen: | |
cseq=cseq[:seqlen] | |
if table.exists(cseq,axis='observation'): | |
seqset.add(cseq) | |
print('loaded %d sequences. %d found in table. filtering' % (numtot,len(seqset))) | |
# filter removing these sequences in place | |
table.filter(list(seqset),axis='observation',invert=True) | |
print('%d sequences left. saving' % table.shape[0]) | |
with biom.util.biom_open(outfilename, 'w') as f: | |
table.to_hdf5(f, "filterbiomseqs") | |
def main(argv): | |
parser=argparse.ArgumentParser(description='Filter sequences from biom table using a fasta file. Version '+__version__) | |
parser.add_argument('-i','--inputtable',help='input biom table file name where each of the observations correspond to a sequence.') | |
parser.add_argument('-o','--output',help='output biom file name with the bloom sequences removed.') | |
parser.add_argument('-f','--fasta',help='FASTA file containing bloom sequences that need to be removed from the input table.') | |
parser.add_argument('--ignore-table-seq-length',help="don't trim fasta file sequences to table read length",action='store_true') | |
args=parser.parse_args(argv) | |
filterbiombyfasta(args.inputtable,args.output,args.fasta,args.ignore_table_seq_length) | |
if __name__ == "__main__": | |
main(sys.argv[1:]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment