Last active
December 17, 2015 08:09
-
-
Save standage/5578116 to your computer and use it in GitHub Desktop.
Traverse the taxonomy to determine the classification of a given taxon at a particular taxonomic rank.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| Copyright (c) 2013-2014, Daniel S. Standage <daniel.standage@gmail.com> | |
| Permission to use, copy, modify, and/or distribute this software for any | |
| purpose with or without fee is hereby granted, provided that the above | |
| copyright notice and this permission notice appear in all copies. | |
| THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | |
| WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | |
| MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | |
| ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | |
| WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | |
| ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | |
| OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |
| */ | |
| // For compiling try: gcc -Wall -O3 -o taxtrav taxtrav.c | |
| #include <assert.h> | |
| #include <ctype.h> | |
| #include <getopt.h> | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <string.h> | |
| //------------------------------------------------------------------------------ | |
| // Basic hash map data structure | |
| //------------------------------------------------------------------------------ | |
| typedef struct _hashmap_item_ | |
| { | |
| char *key; | |
| void *value; | |
| struct _hashmap_item_ *next; | |
| } HashmapItem; | |
| typedef struct | |
| { | |
| int size; | |
| int usage; | |
| HashmapItem **table; | |
| } Hashmap; | |
| // Method prototypes | |
| int hashmap_add(Hashmap *hash, char *key, void *value); | |
| void hashmap_delete(Hashmap *hash, void (*valuefreefunc)(void *)); | |
| void *hashmap_get(Hashmap *hash, char *key); | |
| unsigned int hashmap_hash(Hashmap *hash, char *key); | |
| char **hashmap_keys(Hashmap *hash); | |
| Hashmap *hashmap_new(int size); | |
| int hashmap_size(Hashmap *hash); | |
| // Method implementations | |
| int hashmap_add(Hashmap *hash, char *key, void *value) | |
| { | |
| HashmapItem *item; | |
| unsigned int hashval; | |
| if(hashmap_get(hash, key) != NULL) | |
| return 0; | |
| hashval = hashmap_hash(hash, key); | |
| item = malloc(sizeof(HashmapItem)); | |
| if(item == NULL) | |
| return 0; | |
| item->key = strdup(key); | |
| item->value = value; | |
| item->next = hash->table[hashval]; | |
| hash->table[hashval] = item; | |
| hash->usage += 1; | |
| return 1; | |
| } | |
| void hashmap_delete(Hashmap *hash, void (*valuefreefunc)(void *)) | |
| { | |
| HashmapItem *item, *temp; | |
| int i; | |
| if(hash == NULL) | |
| return; | |
| for(i = 0; i < hash->size; i++) | |
| { | |
| item = hash->table[i]; | |
| while(item != NULL) | |
| { | |
| temp = item; | |
| item = item->next; | |
| free(temp->key); | |
| if(valuefreefunc != NULL) | |
| valuefreefunc(temp->value); | |
| free(temp); | |
| } | |
| } | |
| free(hash->table); | |
| free(hash); | |
| } | |
| void *hashmap_get(Hashmap *hash, char *key) | |
| { | |
| HashmapItem *item; | |
| unsigned int hashval = hashmap_hash(hash, key); | |
| for(item = hash->table[hashval]; item != NULL; item = item->next) | |
| { | |
| if(strcmp(key, item->key) == 0) | |
| return item->value; | |
| } | |
| return NULL; | |
| } | |
| unsigned int hashmap_hash(Hashmap *hash, char *key) | |
| { | |
| unsigned int hashval = 0; | |
| for(; *key != '\0'; key++) | |
| hashval = *key + (hashval << 5) - hashval; | |
| return hashval % hash->size; | |
| } | |
| char **hashmap_keys(Hashmap *hash) | |
| { | |
| char **keys; | |
| HashmapItem *item; | |
| int i, j; | |
| keys = malloc(sizeof(char *) * hash->usage); | |
| j = 0; | |
| for(i = 0; i < hash->size; i++) | |
| { | |
| item = hash->table[i]; | |
| while(item != NULL) | |
| { | |
| keys[j++] = item->key; | |
| item = item->next; | |
| } | |
| } | |
| return keys; | |
| } | |
| Hashmap *hashmap_new(int size) | |
| { | |
| Hashmap *hash; | |
| int i; | |
| if(size < 1) | |
| return NULL; | |
| hash = malloc(sizeof(HashmapItem)); | |
| if(hash == NULL) | |
| return NULL; | |
| hash->table = malloc(sizeof(HashmapItem *) * size); | |
| if(hash->table == NULL) | |
| return NULL; | |
| for(i = 0; i < size; i++) | |
| hash->table[i] = NULL; | |
| hash->size = size; | |
| hash->usage = 0; | |
| return hash; | |
| } | |
| int hashmap_size(Hashmap *hash) | |
| { | |
| return hash->usage; | |
| } | |
| //------------------------------------------------------------------------------ | |
| // Data structure to store info about particular taxonomic units | |
| //------------------------------------------------------------------------------ | |
| typedef struct _taxon_ | |
| { | |
| char *id; | |
| char *rank; | |
| char *name; | |
| char *parentid; | |
| struct _taxon_ *parent; | |
| } Taxon; | |
| // Method prototypes | |
| void taxon_delete(Taxon *t); | |
| Taxon *taxon_new(const char *taxid, const char *parentid, const char *rank); | |
| // Method implementations | |
| void taxon_delete(Taxon *t) | |
| { | |
| free(t->id); | |
| free(t->rank); | |
| free(t->parentid); | |
| if(t->name != NULL) | |
| free(t->name); | |
| free(t); | |
| t = NULL; | |
| } | |
| Taxon *taxon_new(const char *taxid, const char *parentid, const char *rank) | |
| { | |
| Taxon *t = malloc(sizeof(Taxon)); | |
| t->id = strdup(taxid); | |
| t->rank = strdup(rank); | |
| t->parentid = strdup(parentid); | |
| t->name = NULL; | |
| t->parent = NULL; | |
| return t; | |
| } | |
| //------------------------------------------------------------------------------ | |
| // Main implementation | |
| //------------------------------------------------------------------------------ | |
| void print_usage(FILE *outstream) | |
| { | |
| fprintf( outstream, "\n" | |
| "taxtrav: traverse the taxonomy to determine the classification of a given\n" | |
| " taxon at a particular taxonomic rank\n\n" | |
| "Usage: taxtrav [options] input.file\n" | |
| " Options:\n" | |
| " -c|--complement: if the --filter option is enabled, report only\n" | |
| " results that do NOT match the specified taxon;\n" | |
| " for example, if '-c -f order=Hymenoptera' is\n" | |
| " specified, program will only report results for\n" | |
| " input taxa that do not fall within the order\n" | |
| " Hymenoptera\n" | |
| " -d|--delim: STRING list of delimiters separating values in the input\n" | |
| " file; default is ' ,\\t\\n' character\n" | |
| " -f|--filter: STRING restrict results with a filter of the form\n" | |
| " 'rank=taxon_name'; for example, 'order=Hymenoptera'\n" | |
| " will only report results for input taxon IDs that\n" | |
| " fall within the order Hymenoptera; default is no\n" | |
| " filter\n" | |
| " -h|--help print this help message and exit\n" | |
| " -m|--names: FILE path to file containing taxon names; default is\n" | |
| " './names.dmp'; can be downloaded from\n" | |
| " ftp://ftp.ncbi.nih.gov/pub/taxonomy/\n" | |
| " -n|--nodes: FILE path to file containing taxonomy graph; default is\n" | |
| " './nodes.dmp'; can be downloaded from\n" | |
| " ftp://ftp.ncbi.nih.gov/pub/taxonomy/\n" | |
| " -r|--rank: STRING desired taxonomic rank to report for each input\n" | |
| " taxon; default is 'order'; to see all possible\n" | |
| " taxonomic ranks, try the following command\n\n" | |
| " cut -f 3 -d '|' < nodes.dmp \\\n" | |
| " | perl -ne 's/^\\s*(.+)\\s*$/$1/;print $_,\"\\n\"' \\\n" | |
| " | sort | uniq -c | sort -rn \n\n" | |
| "The input file is expected to be a tab-, space-, or comma-delimited text\n" | |
| "file, where the first value of each line is a GenBank taxonomy ID.\n\n" | |
| "The output is 3 comma-delimited values for each line of input: the GenBank\n" | |
| "ID of the input taxon, the GenBank ID of the taxon's classification at the\n" | |
| "requested taxonomic rank, and the human-readable classification of the taxon\n" | |
| "at the requested taxonomic rank.\n\n" ); | |
| } | |
| FILE *myfopen(const char *filename, const char *mode) | |
| { | |
| FILE *fh = fopen(filename, mode); | |
| if(fh == NULL) | |
| { | |
| fprintf(stderr, "error opening file '%s'\n", filename); | |
| exit(1); | |
| } | |
| return fh; | |
| } | |
| char *trim_str(char *str) | |
| { | |
| while(isspace(str[0])) | |
| str += 1; | |
| int i = strlen(str) - 1; | |
| while(isspace(str[i]) && i >= 0) | |
| { | |
| str[i] = '\0'; | |
| i--; | |
| } | |
| return str; | |
| } | |
| Taxon *get_parent_by_rank(Taxon *t, const char *rank) | |
| { | |
| if(strcmp(t->id, "1") == 0) // Base case: found root of taxonomy | |
| return t; | |
| if(strcmp(t->rank, rank) == 0) // Found desired level of taxonomy | |
| return t; | |
| return get_parent_by_rank(t->parent, rank); // Recurse | |
| } | |
| int main(int argc, char **argv) | |
| { | |
| int complement = 0; | |
| const char *delims = ", \t\n"; | |
| const char *filterrank = NULL; | |
| const char *filtertaxon = NULL; | |
| const char *nodesfile = "nodes.dmp"; | |
| const char *namesfile = "names.dmp"; | |
| const char *targetrank = "order"; | |
| // Parse options | |
| int opt = 0; | |
| int optindex = 0; | |
| const char *optstr = "cd:f:hn:m:r:"; | |
| char *temparg; | |
| const struct option init_options[] = | |
| { | |
| { "complement", no_argument, NULL, 'c' }, | |
| { "delim", required_argument, NULL, 'd' }, | |
| { "filter", required_argument, NULL, 'f' }, | |
| { "help", no_argument, NULL, 'h' }, | |
| { "names", required_argument, NULL, 'm' }, | |
| { "nodes", required_argument, NULL, 'n' }, | |
| { "rank", required_argument, NULL, 'r' }, | |
| { NULL, no_argument, NULL, 0 }, | |
| }; | |
| for( opt = getopt_long(argc, argv, optstr, init_options, &optindex); | |
| opt != -1; | |
| opt = getopt_long(argc, argv, optstr, init_options, &optindex) ) | |
| { | |
| switch(opt) | |
| { | |
| case 'c': | |
| complement = 1; | |
| break; | |
| case 'd': | |
| delims = optarg; | |
| break; | |
| case 'f': | |
| temparg = strdup(optarg); | |
| filterrank = strtok(optarg, "="); | |
| filtertaxon = strtok(NULL, "="); | |
| if(filterrank == NULL || filtertaxon == NULL) | |
| { | |
| fprintf(stderr, "error parsing filter '%s'\n", temparg); | |
| return 1; | |
| } | |
| free(temparg); | |
| break; | |
| case 'h': | |
| print_usage(stdout); | |
| return 0; | |
| break; | |
| case 'm': | |
| namesfile = optarg; | |
| break; | |
| case 'n': | |
| nodesfile = optarg; | |
| break; | |
| case 'r': | |
| targetrank = optarg; | |
| break; | |
| default: | |
| break; | |
| } | |
| } | |
| // Validate options and arguments | |
| int filenum = argc - optind; | |
| if(filenum != 1) | |
| { | |
| fprintf(stderr, "error: expected 1 input file, got %d\n", filenum); | |
| print_usage(stderr); | |
| return 1; | |
| } | |
| const char *infile = argv[optind]; | |
| // Add all taxa to a hashtable | |
| FILE *taxnodes = myfopen(nodesfile, "r"); | |
| Hashmap *nodes = hashmap_new(500000); | |
| char buffer[1024]; | |
| while( (fgets(buffer, 1024, taxnodes)) != NULL ) | |
| { | |
| char *nid = strtok(buffer, "|"); | |
| char *pid = strtok(NULL, "|"); | |
| char *rank = strtok(NULL, "|"); | |
| nid = trim_str(nid); | |
| pid = trim_str(pid); | |
| rank = trim_str(rank); | |
| Taxon *t = taxon_new(nid, pid, rank); | |
| int store_success = hashmap_add(nodes, nid, t); | |
| if(store_success == 0) | |
| { | |
| fprintf(stderr, "error storing %p (id=%s) to hashmap\n", t, t->id); | |
| return 1; | |
| } | |
| } | |
| fclose(taxnodes); | |
| // Resolve parent/child relationships | |
| char **tids = hashmap_keys(nodes); | |
| int i; | |
| for(i = 0; i < hashmap_size(nodes); i++) | |
| { | |
| Taxon *t = hashmap_get(nodes, tids[i]); | |
| if(t == NULL) | |
| { | |
| fprintf(stderr, "error fetching taxon with tid '%s'\n", tids[i]); | |
| return 1; | |
| } | |
| Taxon *parent = hashmap_get(nodes, t->parentid); | |
| if(parent == NULL) | |
| { | |
| fprintf(stderr, "error fetching parent with id '%s'\n", t->parentid); | |
| return 1; | |
| } | |
| t->parent = parent; | |
| } | |
| free(tids); | |
| // Add names to taxa | |
| FILE *taxnames = myfopen(namesfile, "r"); | |
| while( (fgets(buffer, 1024, taxnames)) != NULL ) | |
| { | |
| char *tid = strtok(buffer, "|"); | |
| char *name = strtok(NULL, "|"); | |
| char *uniqname = strtok(NULL, "|"); | |
| char *class = strtok(NULL, "|"); | |
| tid = trim_str(tid); | |
| name = trim_str(name); | |
| uniqname = trim_str(uniqname); | |
| class = trim_str(class); | |
| Taxon *t = hashmap_get(nodes, tid); | |
| if(t == NULL) | |
| { | |
| fprintf(stderr, "error fetching node '%s|%s' for naming\n", tid, name); | |
| return 1; | |
| } | |
| if(strcmp(class, "scientific name") == 0) | |
| t->name = strdup(name); | |
| } | |
| fclose(taxnames); | |
| // Determine classification of each taxon at the specified rank | |
| FILE *instream = myfopen(infile, "r"); | |
| while( (fgets(buffer, 1024, instream)) != NULL ) | |
| { | |
| char *taxonid = strtok(buffer, delims); | |
| Taxon *taxon = hashmap_get(nodes, taxonid); | |
| if(taxon == NULL) | |
| { | |
| fprintf(stderr, "error fetching taxon by id '%s'\n", taxonid); | |
| return 1; | |
| } | |
| Taxon *target = get_parent_by_rank(taxon, targetrank); | |
| if(target == NULL) | |
| { | |
| fprintf(stderr, "error resolving %s for species %s\n", targetrank, taxonid); | |
| return 1; | |
| } | |
| else | |
| { | |
| if(filterrank != NULL && filtertaxon != NULL) | |
| { | |
| Taxon *filtertarget = get_parent_by_rank(taxon, filterrank); | |
| int matchfilter = strcmp(filtertarget->name, filtertaxon) == 0; | |
| if(!(matchfilter ^ complement)) | |
| continue; | |
| } | |
| printf("%s,%s,%s\n", taxonid, target->id, target->name); | |
| } | |
| } | |
| fclose(instream); | |
| hashmap_delete(nodes, (void (*)(void *))taxon_delete); | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment