Created
July 8, 2015 18:43
-
-
Save walterst/f6f08f6583bb320bb10d to your computer and use it in GitHub Desktop.
See help text below about usage. Script was created to create 90% majority taxonomy strings for all sequence taxa strings in the Silva 119 release.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# USAGE | |
# python create_majority_taxonomy.py X Y Z A | |
# where X is the taxonomy mapping file for all NR seqs, Y is the representative | |
# file (i.e. one of the rep_set/ files with the 119 release), Z is the OTU | |
# mapping file created from running pick_otus.py, and A is the output | |
# consensus mapping file | |
from sys import argv | |
from cogent.parse.fasta import MinimalFastaParser | |
silva_taxa = open(argv[1], "U") | |
id_to_taxa = {} | |
for line in silva_taxa: | |
curr_id = line.split()[0].strip() | |
curr_taxa = " ".join(line.split()[1:]).strip() | |
id_to_taxa[curr_id] = curr_taxa | |
rep_set_fasta = open(argv[2], "U") | |
ordered_ids = [] | |
for label,seq in MinimalFastaParser(rep_set_fasta): | |
ordered_ids.append(label) | |
ordered_ids = set(ordered_ids) | |
otu_mapping = open(argv[3], "U") | |
matched_ids = {} | |
for line in otu_mapping: | |
if len(line.strip()) == 0: | |
continue | |
curr_line = line.strip().split('\t') | |
curr_otu = curr_line[0] | |
all_seqs = curr_line[1:] | |
for seq in all_seqs: | |
if seq in ordered_ids: | |
matched_ids[seq] = all_seqs | |
output_taxa_mapping = open(argv[4], "w") | |
for id in ordered_ids: | |
taxa_data = [] | |
final_taxa_string = [] | |
for curr_seq in matched_ids[id]: | |
taxa_data.append(id_to_taxa[curr_seq].split(';')[::-1]) | |
levels = len(taxa_data[0]) | |
final_taxa_string = [] | |
for n in range(levels): | |
curr_taxa_strings = [] | |
for tax in taxa_data: | |
curr_taxa_strings.append(tax[n]) | |
# A length of set == 1 should always be unambiguous, otherwise | |
# needs to be <= the length of the full list * 0.10 | |
if len(set(curr_taxa_strings)) == 1: | |
match = True | |
elif float(len(set(curr_taxa_strings))) <= float(0.1 * len(curr_taxa_strings)): | |
match = True | |
else: | |
match = False | |
if match: | |
final_taxa_string.append(curr_taxa_strings[0]) | |
else: | |
final_taxa_string.append("Ambiguous_taxa") | |
fixed_taxa_string = ";".join(final_taxa_string[::-1]) | |
output_taxa_mapping.write("%s\t%s\n" % (id, fixed_taxa_string)) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment