Skip to content

Instantly share code, notes, and snippets.

@cgpu
Last active October 20, 2018 23:13
Show Gist options
  • Select an option

  • Save cgpu/a59239bd341deab82115807283583ee9 to your computer and use it in GitHub Desktop.

Select an option

Save cgpu/a59239bd341deab82115807283583ee9 to your computer and use it in GitHub Desktop.
Script that contains functions for computeSimilarity() and matchFeatures()
#### Function for uniquely matching features between two datasets ####
#
# 1) computing features similarity matrix
# 2) assigning fatures across the two datasets (Hungarian algorithm)
# 3) returning a
#requires clue for the Hungarian algorithm
library(clue)
# let's define the similarity metric
# `peaks` contains both dataframes
compute_similarity <- function(peaks, rt_tolerance, mass_tolerance){
#extracting info
dataset1_time <- peaks[ , 'rtmed_1']
dataset2_time <- peaks[ , 'rtmed_2']
dataset1_mass <- peaks[ , 'mzmed_1']
dataset2_mass <- peaks[ , 'mzmed_2']
#time part of the similarity
time_part <- exp( -1 * ((dataset2_time - dataset1_time) ^ 2)/(2 * (rt_tolerance^2)) )
#mass part of the similarity
mass_part <- exp( -1 * ((dataset2_mass - dataset1_mass) ^ 2)/(2 * (mass_tolerance^2)) )
#final similarity
similarity <- time_part * mass_part;
}
#main function
#dataset1: metabolites to be validate
#dataset2: new metabolites
matchingFeatures <- function(dataset1,
dataset2,
rt_tolerance = 2.50, # default threshold
mass_tolerance = 0.01, # default mz threshold
threshold = 0.8 ) # default similarity threshold)
{
#restricting to the relevant information
dataset1_peaks <- dataset1[ , c('name', 'mzmed', 'rtmed')];
dataset2_peaks <- dataset2[ , c('name', 'mzmed', 'rtmed')];
#transforming time in minutes
dataset1_peaks['rtmed'] <- dataset1_peaks['rtmed'] / 60;
dataset2_peaks['rtmed'] <- dataset2_peaks['rtmed'] / 60;
#cartesian product
peaks <- merge(dataset1_peaks, dataset2_peaks, by = NULL, suffixes = c('_1', '_2'));
#computing the similarities
similarities <- compute_similarity(peaks, rt_tolerance, mass_tolerance);
#filtering out similarities below the theshold
similarities[similarities < threshold] <- 0;
#rearranging the similaritis as a matrix
numFeatures1 <- dim(dataset1)[1];
numFeatures2 <- dim(dataset2)[1];
similarities <- matrix(similarities, numFeatures1, numFeatures2);
rownames(similarities) <- dataset1$name;
colnames(similarities) <- dataset2$name;
#removing zeros columns and rows
rowTotal <- rowSums(similarities)
colTotal <- colSums(similarities)
toRemove <- which(rowTotal == 0)
if(length(toRemove) > 0){
similarities <- similarities[-toRemove, ];
}
toRemove <- which(colTotal == 0)
if(length(toRemove) > 0){
similarities <- similarities[ , -toRemove];
}
#invoking the Hungarian algorithm
solution <- solve_LSAP(similarities, maximum = TRUE)
#formatting the results
solution <- data.frame(seq_along(solution), as.vector(solution), 0);
names(solution) <- c('dataset1', 'dataset2', 'Similarity');
for(i in 1:(dim(solution))[1]){
solution$Similarity[i] <- similarities[solution$dataset1[i], solution$dataset2[i]];
}
solution$dataset1 <- rownames(similarities)[solution$dataset1];
solution$dataset2 <- colnames(similarities)[solution$dataset2];
#removing focibly assigned metabolites
toRemove <- which(solution$Similarity == 0);
if(length(toRemove) > 0){
solution <- solution[-toRemove,]
}
#return the solution
return(solution)
}
# Testing
# dataset1 <- data.frame(matrix(runif(3000), 1500, 2))
# dataset2 <- data.frame(matrix(runif(4000), 2000, 2))
# dataset1$name <- paste0('r', 1:1500)
# dataset2$name <- paste0('c', 1:2000)
# colnames(dataset1) <- c('mzmed', 'rtmed', 'name')
# colnames(dataset2) <- c('mzmed', 'rtmed', 'name')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment