Last active
October 20, 2018 23:13
-
-
Save cgpu/a59239bd341deab82115807283583ee9 to your computer and use it in GitHub Desktop.
Script that contains functions for computeSimilarity() and matchFeatures()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #### Function for uniquely matching features between two datasets #### | |
| # | |
| # 1) computing features similarity matrix | |
| # 2) assigning fatures across the two datasets (Hungarian algorithm) | |
| # 3) returning a | |
| #requires clue for the Hungarian algorithm | |
| library(clue) | |
| # let's define the similarity metric | |
| # `peaks` contains both dataframes | |
| compute_similarity <- function(peaks, rt_tolerance, mass_tolerance){ | |
| #extracting info | |
| dataset1_time <- peaks[ , 'rtmed_1'] | |
| dataset2_time <- peaks[ , 'rtmed_2'] | |
| dataset1_mass <- peaks[ , 'mzmed_1'] | |
| dataset2_mass <- peaks[ , 'mzmed_2'] | |
| #time part of the similarity | |
| time_part <- exp( -1 * ((dataset2_time - dataset1_time) ^ 2)/(2 * (rt_tolerance^2)) ) | |
| #mass part of the similarity | |
| mass_part <- exp( -1 * ((dataset2_mass - dataset1_mass) ^ 2)/(2 * (mass_tolerance^2)) ) | |
| #final similarity | |
| similarity <- time_part * mass_part; | |
| } | |
| #main function | |
| #dataset1: metabolites to be validate | |
| #dataset2: new metabolites | |
| matchingFeatures <- function(dataset1, | |
| dataset2, | |
| rt_tolerance = 2.50, # default threshold | |
| mass_tolerance = 0.01, # default mz threshold | |
| threshold = 0.8 ) # default similarity threshold) | |
| { | |
| #restricting to the relevant information | |
| dataset1_peaks <- dataset1[ , c('name', 'mzmed', 'rtmed')]; | |
| dataset2_peaks <- dataset2[ , c('name', 'mzmed', 'rtmed')]; | |
| #transforming time in minutes | |
| dataset1_peaks['rtmed'] <- dataset1_peaks['rtmed'] / 60; | |
| dataset2_peaks['rtmed'] <- dataset2_peaks['rtmed'] / 60; | |
| #cartesian product | |
| peaks <- merge(dataset1_peaks, dataset2_peaks, by = NULL, suffixes = c('_1', '_2')); | |
| #computing the similarities | |
| similarities <- compute_similarity(peaks, rt_tolerance, mass_tolerance); | |
| #filtering out similarities below the theshold | |
| similarities[similarities < threshold] <- 0; | |
| #rearranging the similaritis as a matrix | |
| numFeatures1 <- dim(dataset1)[1]; | |
| numFeatures2 <- dim(dataset2)[1]; | |
| similarities <- matrix(similarities, numFeatures1, numFeatures2); | |
| rownames(similarities) <- dataset1$name; | |
| colnames(similarities) <- dataset2$name; | |
| #removing zeros columns and rows | |
| rowTotal <- rowSums(similarities) | |
| colTotal <- colSums(similarities) | |
| toRemove <- which(rowTotal == 0) | |
| if(length(toRemove) > 0){ | |
| similarities <- similarities[-toRemove, ]; | |
| } | |
| toRemove <- which(colTotal == 0) | |
| if(length(toRemove) > 0){ | |
| similarities <- similarities[ , -toRemove]; | |
| } | |
| #invoking the Hungarian algorithm | |
| solution <- solve_LSAP(similarities, maximum = TRUE) | |
| #formatting the results | |
| solution <- data.frame(seq_along(solution), as.vector(solution), 0); | |
| names(solution) <- c('dataset1', 'dataset2', 'Similarity'); | |
| for(i in 1:(dim(solution))[1]){ | |
| solution$Similarity[i] <- similarities[solution$dataset1[i], solution$dataset2[i]]; | |
| } | |
| solution$dataset1 <- rownames(similarities)[solution$dataset1]; | |
| solution$dataset2 <- colnames(similarities)[solution$dataset2]; | |
| #removing focibly assigned metabolites | |
| toRemove <- which(solution$Similarity == 0); | |
| if(length(toRemove) > 0){ | |
| solution <- solution[-toRemove,] | |
| } | |
| #return the solution | |
| return(solution) | |
| } | |
| # Testing | |
| # dataset1 <- data.frame(matrix(runif(3000), 1500, 2)) | |
| # dataset2 <- data.frame(matrix(runif(4000), 2000, 2)) | |
| # dataset1$name <- paste0('r', 1:1500) | |
| # dataset2$name <- paste0('c', 1:2000) | |
| # colnames(dataset1) <- c('mzmed', 'rtmed', 'name') | |
| # colnames(dataset2) <- c('mzmed', 'rtmed', 'name') | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment