Last active
December 8, 2017 23:08
-
-
Save sminot/680a354fbf1f3bb25220577718bd0187 to your computer and use it in GitHub Desktop.
Make a taxonomy file compatible with mothur
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
"""Make a taxonomy file compatible with mothur.""" | |
import os | |
import sys | |
import pandas as pd | |
# Validate the command line before reading sys.argv[1:], so that a wrong
# argument count stops with the usage message instead of falling through
# to an IndexError. sys.exit(<str>) prints the message to stderr and exits
# with status 1.
if len(sys.argv) != 4:
    sys.exit(
        "Please specify the seq_info.csv, tax_info.csv, and output.tsv files"
    )

# Get the filepaths
seqinfo, taxinfo, output = sys.argv[1:4]
for input_path in (seqinfo, taxinfo):
    # Explicit check rather than `assert`, which is removed under `python -O`.
    if not os.path.exists(input_path):
        sys.exit("Input file does not exist: {}".format(input_path))

# Read in the tables (both inputs are comma-separated).
# From here on `taxinfo` and `seqinfo` hold DataFrames, not file paths.
taxinfo = pd.read_csv(taxinfo)
if "tax_id" not in taxinfo:
    sys.exit("The tax_info file must contain a 'tax_id' column")
# Index by tax_id so rows can be looked up with taxinfo.loc[tax_id, column].
taxinfo.set_index("tax_id", inplace=True)
seqinfo = pd.read_csv(seqinfo)
# Function to generate the taxonomy string for a certain taxid
def taxonomy_string(taxid, taxinfo, sep=';', root=1):
    """Return the root-to-leaf lineage of `taxid` as a delimited string.

    Starting at `taxid`, follows each row's "parent_id" in `taxinfo` until
    `root` (or a row that is its own parent) is reached, collecting the
    "tax_name" of every row visited, then joins the names root-first.

    Args:
        taxid: Identifier of the taxon to describe.
        taxinfo: DataFrame indexed by taxon id, with "tax_name" and
            "parent_id" columns.
        sep: Delimiter written between names and after the final name.
        root: Identifier of the taxon whose name every lineage must end at.

    Returns:
        The names from the root down to `taxid`, each followed by `sep`,
        e.g. "root;Bacteria;Firmicutes;".

    Raises:
        ValueError: If `taxid` is not in `taxinfo`, if the parent links
            form a cycle, or if the lineage does not end at the name of
            `root`.
    """
    # Test membership on the Index itself instead of scanning the full
    # `.values` array on every step of the walk.
    known_ids = taxinfo.index
    lineage = []
    visited = set()
    current = taxid
    while current in known_ids:
        # Guard against cyclic parent links (A -> B -> A), which would
        # otherwise keep this loop running forever.
        if current in visited:
            raise ValueError(
                "Cycle detected in parent_id links while resolving tax_id "
                "{!r}".format(taxid)
            )
        visited.add(current)
        lineage.append(taxinfo.loc[current, "tax_name"])
        if current == root:
            break
        parent = taxinfo.loc[current, "parent_id"]
        # A row that is its own parent is a top-level node: stop here.
        if parent == current:
            break
        current = parent

    if not lineage:
        raise ValueError(
            "tax_id {!r} was not found in the taxonomy table".format(taxid)
        )
    # Explicit checks rather than `assert`, which is removed under -O.
    if root not in known_ids or lineage[-1] != taxinfo.loc[root, "tax_name"]:
        raise ValueError(
            "Lineage of tax_id {!r} does not end at root {!r}".format(
                taxid, root
            )
        )
    # Names were collected leaf-first; emit them root-first.
    return sep.join(reversed(lineage)) + sep
# Write the mothur taxonomy table: one "<seqname>\t<lineage>" line per row
# of seqinfo.
cache = {}  # tax_id -> lineage string, so each taxon is resolved only once
row_template = "{}\t{}\n"
with open(output, "wt") as out_handle:
    for _, record in seqinfo.iterrows():
        tax_id = record["tax_id"]
        try:
            lineage = cache[tax_id]
        except KeyError:
            # First time this tax_id is seen: build and remember its lineage.
            lineage = cache[tax_id] = taxonomy_string(tax_id, taxinfo)
        out_handle.write(row_template.format(record["seqname"], lineage))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment