mortonjt · October 24, 2016 20:35
diff --git a/filterbiomseqs.py b/filterbiomseqs.py
 #!/usr/bin/env python


 """
 Filter sequences from a biom table based on a fasta file
 used for AG bloom filtering
 """

 # amnonscript

 __version__ = "1.2"

 import biom

 import argparse
 import sys


 def iterfastaseqs(filename):
 	"""
 	iterate a fasta file and return header,sequence
 	input:
 	filename - the fasta file name

 	output:
 	seq - the sequence
 	header - the header
 	"""

 	fl=open(filename,"rU")
 	cseq=''
 	chead=''
 	for cline in fl:
 		if cline[0]=='>':
 			if chead:
 				yield(cseq,chead)
 			cseq=''
 			chead=cline[1:].rstrip()
 		else:
 			cseq+=cline.strip()
 	if cseq:
 		yield(cseq,chead)
 	fl.close()


 def filterbiombyfasta(tablefilename,outfilename,fastafilename,ignoretablelen=False):
 	print('loading biom table %s' % tablefilename)
 	table=biom.load_table(tablefilename)
 	print('table has %d sequences.' % table.shape[0])
 	print('loading fasta file %s' % fastafilename)
 	seqset=set()
 	numtot=0
 	# get the read len
 	if not ignoretablelen:
 		tseqs=table.ids('observation')
 		seqlen=len(tseqs[0])
 		for cseq in tseqs:
 			if seqlen!=len(cseq):
 				ignoretablelen=True
 				print('Sequence length not uniform in table! bad sequence is %s' % cseq)
 				break
 	for cseq,chead in iterfastaseqs(fastafilename):
 		numtot+=1
 		if not ignoretablelen:
 			cseq=cseq[:seqlen]
 		if table.exists(cseq,axis='observation'):
 			seqset.add(cseq)
 	print('loaded %d sequences. %d found in table. filtering' % (numtot,len(seqset)))
 	# filter removing these sequences in place
 	table.filter(list(seqset),axis='observation',invert=True)
 	print('%d sequences left. saving' % table.shape[0])
 	with biom.util.biom_open(outfilename, 'w') as f:
 		table.to_hdf5(f, "filterbiomseqs")


 def main(argv):
 	parser=argparse.ArgumentParser(description='Filter sequences from biom table using a fasta file. Version '+__version__)
 	parser.add_argument('-i','--inputtable',help='input biom table file name where each of the observations correspond to a sequence.')
 	parser.add_argument('-o','--output',help='output biom file name with the bloom sequences removed.')
 	parser.add_argument('-f','--fasta',help='FASTA file containing bloom sequences that need to be removed from the input table.')
 	parser.add_argument('--ignore-table-seq-length',help="don't trim fasta file sequences to table read length",action='store_true')

 	args=parser.parse_args(argv)

 	filterbiombyfasta(args.inputtable,args.output,args.fasta,args.ignore_table_seq_length)

 if __name__ == "__main__":
 	main(sys.argv[1:])
	#!/usr/bin/env python


	"""
	Filter sequences from a biom table based on a fasta file
	used for AG bloom filtering
	"""

	# amnonscript

	__version__ = "1.2"

	import biom

	import argparse
	import sys


	def iterfastaseqs(filename):
	"""
	iterate a fasta file and return header,sequence
	input:
	filename - the fasta file name

	output:
	seq - the sequence
	header - the header
	"""

	fl=open(filename,"rU")
	cseq=''
	chead=''
	for cline in fl:
	if cline[0]=='>':
	if chead:
	yield(cseq,chead)
	cseq=''
	chead=cline[1:].rstrip()
	else:
	cseq+=cline.strip()
	if cseq:
	yield(cseq,chead)
	fl.close()


	def filterbiombyfasta(tablefilename,outfilename,fastafilename,ignoretablelen=False):
	print('loading biom table %s' % tablefilename)
	table=biom.load_table(tablefilename)
	print('table has %d sequences.' % table.shape[0])
	print('loading fasta file %s' % fastafilename)
	seqset=set()
	numtot=0
	# get the read len
	if not ignoretablelen:
	tseqs=table.ids('observation')
	seqlen=len(tseqs[0])
	for cseq in tseqs:
	if seqlen!=len(cseq):
	ignoretablelen=True
	print('Sequence length not uniform in table! bad sequence is %s' % cseq)
	break
	for cseq,chead in iterfastaseqs(fastafilename):
	numtot+=1
	if not ignoretablelen:
	cseq=cseq[:seqlen]
	if table.exists(cseq,axis='observation'):
	seqset.add(cseq)
	print('loaded %d sequences. %d found in table. filtering' % (numtot,len(seqset)))
	# filter removing these sequences in place
	table.filter(list(seqset),axis='observation',invert=True)
	print('%d sequences left. saving' % table.shape[0])
	with biom.util.biom_open(outfilename, 'w') as f:
	table.to_hdf5(f, "filterbiomseqs")


	def main(argv):
	parser=argparse.ArgumentParser(description='Filter sequences from biom table using a fasta file. Version '+__version__)
	parser.add_argument('-i','--inputtable',help='input biom table file name where each of the observations correspond to a sequence.')
	parser.add_argument('-o','--output',help='output biom file name with the bloom sequences removed.')
	parser.add_argument('-f','--fasta',help='FASTA file containing bloom sequences that need to be removed from the input table.')
	parser.add_argument('--ignore-table-seq-length',help="don't trim fasta file sequences to table read length",action='store_true')

	args=parser.parse_args(argv)

	filterbiombyfasta(args.inputtable,args.output,args.fasta,args.ignore_table_seq_length)

	if __name__ == "__main__":
	main(sys.argv[1:])
No results found