gregcaporaso · May 7, 2013 19:23
diff --git a/clean_nifH_tax.py b/clean_nifH_tax.py
 #!/usr/bin/env python
 # File created on 07 May 2013
 from __future__ import division

 __author__ = "Greg Caporaso"
 __copyright__ = "Copyright 2011, The QIIME project"
 __credits__ = ["Greg Caporaso"]
 __license__ = "GPL"
 __version__ = "1.6.0-dev"
 __maintainer__ = "Greg Caporaso"
 __email__ = "[email protected]"
 __status__ = "Development"

 from collections import defaultdict
 from qiime.util import parse_command_line_parameters, make_option

 # Command-line metadata consumed by parse_command_line_parameters(); the
 # descriptions below summarize what main() does with the input file.
 script_info = {}
 script_info['brief_description'] = (
     "Rewrite a tab-separated taxonomy map as two ';'-separated levels")
 script_info['script_description'] = (
     "Reads a tab-separated file whose first field is an identifier and "
     "whose second field is a taxonomy with levels separated by '/'. Each "
     "identifier is written out with a two-level, ';'-separated taxonomy: "
     "a missing taxonomy, or a single level beginning with 'Species', "
     "becomes 'Unclassified;Unclassified'; a single level X becomes "
     "'X;Unclassified'; 'X/IA' becomes 'X/IA;Unclassified'; and 'X/Y' "
     "becomes 'X;Y'. A taxonomy with more than two levels is an error. "
     "Counts of every level 1 and level 2 label are printed to stdout.")
 script_info['script_usage'] = [(
     "Example",
     "Clean the taxonomy in tax.txt and write the result to tax_clean.txt",
     "%prog -i tax.txt -o tax_clean.txt")]
 script_info['output_description'] = (
     "A tab-separated file with one line per input line: the identifier "
     "followed by the cleaned 'level1;level2' taxonomy.")
 script_info['required_options'] = [
 make_option('-i','--input_fp',type="existing_filepath",
     help='the input filepath (tab-separated: identifier, taxonomy)'),
 make_option('-o','--output_fp',type="new_filepath",
     help='the output filepath (tab-separated: identifier, cleaned taxonomy)'),
 ]
 script_info['optional_options'] = []
 script_info['version'] = __version__



 def _clean_tax(raw_tax):
    """Normalize one raw taxonomy field into a (level1, level2) pair.

    raw_tax is the second tab-separated field of an input line, or None
    when the line has no second field. Levels inside the field are
    separated by '/'.

    Returns a tuple of two strings:
      * None, or a single level starting with 'Species'
            -> ('Unclassified', 'Unclassified')
      * a single level X  -> (X, 'Unclassified')
      * 'X/IA'            -> ('X/IA', 'Unclassified')
      * 'X/Y'             -> (X, Y)

    Raises ValueError if the field holds more than two '/'-separated levels.
    """
    if raw_tax is None:
        return 'Unclassified', 'Unclassified'
    levels = raw_tax.split('/')
    if len(levels) == 1:
        if levels[0].startswith('Species'):
            return 'Unclassified', 'Unclassified'
        return levels[0], 'Unclassified'
    if len(levels) != 2:
        raise ValueError("Bad number of fields: %s" % repr(levels))
    if levels[1] == 'IA':
        # 'IA' is not counted as a level 2 label; it stays attached to level 1.
        return '%s/IA' % levels[0], 'Unclassified'
    return levels[0], levels[1]


 def _print_summary(label, counts):
    """Print every key of counts with its tally, followed by the total."""
    print("%s summary" % label)
    print("==========")
    for name, count in counts.items():
        print("%s %s" % (name, count))
    # Two spaces after the colon reproduce the output of the original
    # statement: print "Total L1: ", total
    print("Total %s:  %s" % (label, sum(counts.values())))
    print("")


 def main():
    """Clean the taxonomy column of a tab-separated file and report counts.

    Reads opts.input_fp line by line. The first tab-separated field is the
    identifier and the second (if present) is the raw taxonomy; any further
    fields are ignored. Each identifier is written to opts.output_fp as
    'id<TAB>level1;level2', using the rules in _clean_tax. Afterwards the
    number of lines assigned to each level 1 and level 2 label is printed
    to stdout.

    Raises ValueError (from _clean_tax) on a taxonomy with more than two
    levels; lines written before the bad line remain in the output file.
    """
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    l1 = defaultdict(int)
    l2 = defaultdict(int)
    # The with-blocks close both files even when a bad line raises.
    # 'U' (universal newlines) is kept because this script targets the
    # Python 2 interpreter that QIIME 1.x runs on.
    with open(opts.input_fp,'U') as in_f:
        with open(opts.output_fp,'w') as out_f:
            for line in in_f:
                fields = line.strip().split('\t')
                id_ = fields[0]
                raw_tax = fields[1] if len(fields) > 1 else None
                level1, level2 = _clean_tax(raw_tax)
                l1[level1] += 1
                l2[level2] += 1
                out_f.write('%s\t%s;%s\n' % (id_, level1, level2))

    _print_summary("L1", l1)
    _print_summary("L2", l2)
        

 # Run the command-line entry point only when this file is executed as a
 # script, not when it is imported as a module.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python
	# File created on 07 May 2013
	from __future__ import division

	__author__ = "Greg Caporaso"
	__copyright__ = "Copyright 2011, The QIIME project"
	__credits__ = ["Greg Caporaso"]
	__license__ = "GPL"
	__version__ = "1.6.0-dev"
	__maintainer__ = "Greg Caporaso"
	__email__ = "[email protected]"
	__status__ = "Development"

	from collections import defaultdict
	from qiime.util import parse_command_line_parameters, make_option

	# Command-line metadata consumed by parse_command_line_parameters(); the
	# descriptions below summarize what main() does with the input file.
	script_info = {}
	script_info['brief_description'] = (
	    "Rewrite a tab-separated taxonomy map as two ';'-separated levels")
	script_info['script_description'] = (
	    "Reads a tab-separated file whose first field is an identifier and "
	    "whose second field is a taxonomy with levels separated by '/'. Each "
	    "identifier is written out with a two-level, ';'-separated taxonomy: "
	    "a missing taxonomy, or a single level beginning with 'Species', "
	    "becomes 'Unclassified;Unclassified'; a single level X becomes "
	    "'X;Unclassified'; 'X/IA' becomes 'X/IA;Unclassified'; and 'X/Y' "
	    "becomes 'X;Y'. A taxonomy with more than two levels is an error. "
	    "Counts of every level 1 and level 2 label are printed to stdout.")
	script_info['script_usage'] = [(
	    "Example",
	    "Clean the taxonomy in tax.txt and write the result to tax_clean.txt",
	    "%prog -i tax.txt -o tax_clean.txt")]
	script_info['output_description'] = (
	    "A tab-separated file with one line per input line: the identifier "
	    "followed by the cleaned 'level1;level2' taxonomy.")
	script_info['required_options'] = [
	make_option('-i','--input_fp',type="existing_filepath",
	    help='the input filepath (tab-separated: identifier, taxonomy)'),
	make_option('-o','--output_fp',type="new_filepath",
	    help='the output filepath (tab-separated: identifier, cleaned taxonomy)'),
	]
	script_info['optional_options'] = []
	script_info['version'] = __version__



	def _clean_tax(raw_tax):
	    """Normalize one raw taxonomy field into a (level1, level2) pair.

	    raw_tax is the second tab-separated field of an input line, or None
	    when the line has no second field. Levels inside the field are
	    separated by '/'.

	    Returns a tuple of two strings:
	      * None, or a single level starting with 'Species'
	            -> ('Unclassified', 'Unclassified')
	      * a single level X  -> (X, 'Unclassified')
	      * 'X/IA'            -> ('X/IA', 'Unclassified')
	      * 'X/Y'             -> (X, Y)

	    Raises ValueError if the field holds more than two '/'-separated levels.
	    """
	    if raw_tax is None:
	        return 'Unclassified', 'Unclassified'
	    levels = raw_tax.split('/')
	    if len(levels) == 1:
	        if levels[0].startswith('Species'):
	            return 'Unclassified', 'Unclassified'
	        return levels[0], 'Unclassified'
	    if len(levels) != 2:
	        raise ValueError("Bad number of fields: %s" % repr(levels))
	    if levels[1] == 'IA':
	        # 'IA' is not counted as a level 2 label; it stays attached to level 1.
	        return '%s/IA' % levels[0], 'Unclassified'
	    return levels[0], levels[1]


	def _print_summary(label, counts):
	    """Print every key of counts with its tally, followed by the total."""
	    print("%s summary" % label)
	    print("==========")
	    for name, count in counts.items():
	        print("%s %s" % (name, count))
	    # Two spaces after the colon reproduce the output of the original
	    # statement: print "Total L1: ", total
	    print("Total %s:  %s" % (label, sum(counts.values())))
	    print("")


	def main():
	    """Clean the taxonomy column of a tab-separated file and report counts.

	    Reads opts.input_fp line by line. The first tab-separated field is the
	    identifier and the second (if present) is the raw taxonomy; any further
	    fields are ignored. Each identifier is written to opts.output_fp as
	    'id<TAB>level1;level2', using the rules in _clean_tax. Afterwards the
	    number of lines assigned to each level 1 and level 2 label is printed
	    to stdout.

	    Raises ValueError (from _clean_tax) on a taxonomy with more than two
	    levels; lines written before the bad line remain in the output file.
	    """
	    option_parser, opts, args =\
	       parse_command_line_parameters(**script_info)

	    l1 = defaultdict(int)
	    l2 = defaultdict(int)
	    # The with-blocks close both files even when a bad line raises.
	    # 'U' (universal newlines) is kept because this script targets the
	    # Python 2 interpreter that QIIME 1.x runs on.
	    with open(opts.input_fp,'U') as in_f:
	        with open(opts.output_fp,'w') as out_f:
	            for line in in_f:
	                fields = line.strip().split('\t')
	                id_ = fields[0]
	                raw_tax = fields[1] if len(fields) > 1 else None
	                level1, level2 = _clean_tax(raw_tax)
	                l1[level1] += 1
	                l2[level2] += 1
	                out_f.write('%s\t%s;%s\n' % (id_, level1, level2))

	    _print_summary("L1", l1)
	    _print_summary("L2", l2)


	# Run the command-line entry point only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == "__main__":
	    main()