Created
May 7, 2013 19:23
-
-
Save gregcaporaso/5535393 to your computer and use it in GitHub Desktop.
A single-use script for cleaning up taxonomy strings from a nifH database generated by Gady and Buckley (2012) and exported by the authors from ARB. This is being use to hack together a 13_5 release of a nifH database for use with PrimerProspector and QIIME. The database reference is:
PLoS One. 2012;7(7):e42149. doi: 10.1371/journal.pone.0042149…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# File created on 07 May 2013 | |
from __future__ import division | |
__author__ = "Greg Caporaso" | |
__copyright__ = "Copyright 2011, The QIIME project" | |
__credits__ = ["Greg Caporaso"] | |
__license__ = "GPL" | |
__version__ = "1.6.0-dev" | |
__maintainer__ = "Greg Caporaso" | |
__email__ = "[email protected]" | |
__status__ = "Development" | |
from collections import defaultdict | |
from qiime.util import parse_command_line_parameters, make_option | |
script_info = {} | |
script_info['brief_description'] = "" | |
script_info['script_description'] = "" | |
script_info['script_usage'] = [("","","")] | |
script_info['output_description']= "" | |
script_info['required_options'] = [ | |
make_option('-i','--input_fp',type="existing_filepath",help='the input filepath'), | |
make_option('-o','--output_fp',type="new_filepath",help='the output filepath'), | |
] | |
script_info['optional_options'] = [] | |
script_info['version'] = __version__ | |
def main(): | |
option_parser, opts, args =\ | |
parse_command_line_parameters(**script_info) | |
f = open(opts.input_fp,'U') | |
g = open(opts.output_fp,'w') | |
l1 = defaultdict(int) | |
l2 = defaultdict(int) | |
for line in f: | |
fields = line.strip().split('\t') | |
id_ = fields[0] | |
try: | |
tax = fields[1] | |
except IndexError: | |
tax = 'Unclassified;Unclassified' | |
l1['Unclassified'] += 1 | |
l2['Unclassified'] += 1 | |
else: | |
tax = tax.split('/') | |
if len(tax) == 1 and tax[0].startswith('Species'): | |
tax = 'Unclassified;Unclassified' | |
l1['Unclassified'] += 1 | |
l2['Unclassified'] += 1 | |
elif len(tax) == 1: | |
l1[tax[0]] += 1 | |
l2['Unclassified'] += 1 | |
tax = '%s;Unclassified' % tax[0] | |
elif len(tax) != 2: | |
raise ValueError, "Bad number of fields: %s" % repr(tax) | |
elif tax[1] == 'IA': | |
l1['%s/IA' % tax[0]] += 1 | |
l2['Unclassified'] += 1 | |
tax = '%s/IA;Unclassified' % tax[0] | |
else: | |
l1[tax[0]] += 1 | |
l2[tax[1]] += 1 | |
tax = ';'.join(tax) | |
g.write('%s\t%s\n' % (id_,tax)) | |
g.close() | |
total_l1 = 0 | |
print "L1 summary" | |
print "==========" | |
for k, v in l1.items(): | |
print k, v | |
total_l1 += v | |
print "Total L1: ", total_l1 | |
print "" | |
total_l2 = 0 | |
print "L2 summary" | |
print "==========" | |
for k, v in l2.items(): | |
print k, v | |
total_l2 += v | |
print "Total L2: ", total_l2 | |
print "" | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment