Skip to content

Instantly share code, notes, and snippets.

@gregcaporaso
Created May 7, 2013 19:23
Show Gist options
  • Save gregcaporaso/5535393 to your computer and use it in GitHub Desktop.
Save gregcaporaso/5535393 to your computer and use it in GitHub Desktop.
A single-use script for cleaning up taxonomy strings from a nifH database generated by Gady and Buckley (2012) and exported by the authors from ARB. This is being use to hack together a 13_5 release of a nifH database for use with PrimerProspector and QIIME. The database reference is: PLoS One. 2012;7(7):e42149. doi: 10.1371/journal.pone.0042149…
#!/usr/bin/env python
# File created on 07 May 2013
from __future__ import division
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME project"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "1.6.0-dev"
__maintainer__ = "Greg Caporaso"
__email__ = "[email protected]"
__status__ = "Development"
from collections import defaultdict
from qiime.util import parse_command_line_parameters, make_option
script_info = {}
script_info['brief_description'] = ""
script_info['script_description'] = ""
script_info['script_usage'] = [("","","")]
script_info['output_description']= ""
script_info['required_options'] = [
make_option('-i','--input_fp',type="existing_filepath",help='the input filepath'),
make_option('-o','--output_fp',type="new_filepath",help='the output filepath'),
]
script_info['optional_options'] = []
script_info['version'] = __version__
def main():
option_parser, opts, args =\
parse_command_line_parameters(**script_info)
f = open(opts.input_fp,'U')
g = open(opts.output_fp,'w')
l1 = defaultdict(int)
l2 = defaultdict(int)
for line in f:
fields = line.strip().split('\t')
id_ = fields[0]
try:
tax = fields[1]
except IndexError:
tax = 'Unclassified;Unclassified'
l1['Unclassified'] += 1
l2['Unclassified'] += 1
else:
tax = tax.split('/')
if len(tax) == 1 and tax[0].startswith('Species'):
tax = 'Unclassified;Unclassified'
l1['Unclassified'] += 1
l2['Unclassified'] += 1
elif len(tax) == 1:
l1[tax[0]] += 1
l2['Unclassified'] += 1
tax = '%s;Unclassified' % tax[0]
elif len(tax) != 2:
raise ValueError, "Bad number of fields: %s" % repr(tax)
elif tax[1] == 'IA':
l1['%s/IA' % tax[0]] += 1
l2['Unclassified'] += 1
tax = '%s/IA;Unclassified' % tax[0]
else:
l1[tax[0]] += 1
l2[tax[1]] += 1
tax = ';'.join(tax)
g.write('%s\t%s\n' % (id_,tax))
g.close()
total_l1 = 0
print "L1 summary"
print "=========="
for k, v in l1.items():
print k, v
total_l1 += v
print "Total L1: ", total_l1
print ""
total_l2 = 0
print "L2 summary"
print "=========="
for k, v in l2.items():
print k, v
total_l2 += v
print "Total L2: ", total_l2
print ""
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment