ashenkin · November 29, 2015 10:14
diff --git a/query_higher_taxa_classes.r b/query_higher_taxa_classes.r
 query_higher_taxa_classes <- function(species_list, known = "genus", order = c("dataframe", "unique_sp")) {
    # Pass in a character vector of species, genera, families, or whatever (the search is flexible)
    # Returns a dataframe with the columns: query, db, family, order, subdivision
    # The dataframe returned is guaranteed to be in the same order as the species list passed in if order is "dataframe"
    order = match.arg(order)
    
    library(taxize)
    library(plyr)
    species_list = sub("^([^ ]*).*$","\\1",species_list) # just take the top level name before the space
    # remove short names that clog the taxon query - replace later
        which_short = which(nchar(species_list) < 3)
        short_names = species_list[which_short]
        species_list[which_short] = "TEMPTAXON"
    # agglomerate to be more efficient
        species_list_uniq = unique(species_list)
    if (known == "genus") {
        highertaxa <- tax_name(species_list_uniq,get=c("family", "order", "subdivision"), db = "both")
    } else if (known == "family") {
        highertaxa <- tax_name(species_list_uniq,get=c("order", "subdivision"), db = "both")
    }
    
    highertaxa_ncbi = subset(highertaxa, db == "ncbi")
    highertaxa_itis = subset(highertaxa, db == "itis")
    
    merdat <- merge(highertaxa_ncbi, highertaxa_ncbi, by="query")  # seems self-documenting
    merdat$family.y[ is.na(merdat$family.y) ] <- merdat$family.x[ is.na(merdat$family.y) ]
    merdat$order.y[ is.na(merdat$order.y) ] <- merdat$order.x[ is.na(merdat$order.y) ]
    merdat$subdivision.y[ is.na(merdat$subdivision.y) ] <- merdat$subdivision.x[ is.na(merdat$subdivision.y) ]
    merdat = merdat[,c(1,grep("\\.x",names(merdat)))]
    names(merdat) = sub("\\.x","",names(merdat))
    highertaxa = merdat
    
    species_list[which_short] = short_names # replace short names removed earlier
    ret_df = join(data.frame(query = species_list), highertaxa, by = "query", type = "left") # NA's will be filled in for short names that don't have a match in highertaxa now
    if (order == "dataframe") {
        return(ret_df)
    } else if (order == "unique_sp") {
        return(ret_df[-duplicated(ret_df$query),])
    }
 }
	query_higher_taxa_classes <- function(species_list, known = "genus", order = c("dataframe", "unique_sp")) {
	# Pass in a character vector of species, genera, families, or whatever (the search is flexible)
	# Returns a dataframe with the columns: query, db, family, order, subdivision
	# The dataframe returned is guaranteed to be in the same order as the species list passed in if order is "dataframe"
	order = match.arg(order)

	library(taxize)
	library(plyr)
	species_list = sub("^([^ ]).$","\\1",species_list) # just take the top level name before the space
	# remove short names that clog the taxon query - replace later
	which_short = which(nchar(species_list) < 3)
	short_names = species_list[which_short]
	species_list[which_short] = "TEMPTAXON"
	# agglomerate to be more efficient
	species_list_uniq = unique(species_list)
	if (known == "genus") {
	highertaxa <- tax_name(species_list_uniq,get=c("family", "order", "subdivision"), db = "both")
	} else if (known == "family") {
	highertaxa <- tax_name(species_list_uniq,get=c("order", "subdivision"), db = "both")
	}

	highertaxa_ncbi = subset(highertaxa, db == "ncbi")
	highertaxa_itis = subset(highertaxa, db == "itis")

	merdat <- merge(highertaxa_ncbi, highertaxa_ncbi, by="query") # seems self-documenting
	merdat$family.y[ is.na(merdat$family.y) ] <- merdat$family.x[ is.na(merdat$family.y) ]
	merdat$order.y[ is.na(merdat$order.y) ] <- merdat$order.x[ is.na(merdat$order.y) ]
	merdat$subdivision.y[ is.na(merdat$subdivision.y) ] <- merdat$subdivision.x[ is.na(merdat$subdivision.y) ]
	merdat = merdat[,c(1,grep("\\.x",names(merdat)))]
	names(merdat) = sub("\\.x","",names(merdat))
	highertaxa = merdat

	species_list[which_short] = short_names # replace short names removed earlier
	ret_df = join(data.frame(query = species_list), highertaxa, by = "query", type = "left") # NA's will be filled in for short names that don't have a match in highertaxa now
	if (order == "dataframe") {
	return(ret_df)
	} else if (order == "unique_sp") {
	return(ret_df[-duplicated(ret_df$query),])
	}
	}