chasemc · December 19, 2018 23:37
diff --git a/auto_prior.rmd b/auto_prior.rmd
 ---
 title: "Sponge Picking"
 author: "Chase Clark"
 date: "November 4, 2018"
 output: html_document
 ---

 ```{r setup, include=FALSE}
 knitr::opts_chunk$set(echo = TRUE)
 ```

 ```{r}
 library(pool)
 library(ape)
 library(dplyr)
 library(magrittr)
 library(RSQLite)
 library(pool)
 library(dendextend)
 ```


 ```{r}

 binnR <- function (vectorlist, ppm, low, high, increment){
  
  # Create a vector entry for every unique element
  longVector <- seq(from = low,
                    to = high,
                    by = increment)
  # length of vector that will be created
  nx <- length(longVector)
  # Adjust ppm to decimal tolarance across long vector
  toll <- ppm / 10e5 * longVector
  # Iterate over the provided list of vectors
  lapply(vectorlist, function(vec){
    
    matches <- rep(0, nx)
    ry <- 1:length(vec)
    for (i in seq_along(vec)) {
      # returns abs diff across entire long vector
      dif <- abs(longVector - vec[i])
      matches[which(dif <= toll)] <- 1
    }
    
    matches
    
  })
 }
 ```


 All MALDI data up to this point was run through the new version of IDBac, which produces a single sqlite file that contains all the mzml files, processed spectra, etc.
 ```{r}
 db <-"data/Sponge.sqlite"
 ```

 Connect to database
 ```{r}
 db <- pool::dbPool(drv = RSQLite::SQLite(),
                    dbname = db)
 ```


 In the IDBac software all protein spectra were processed and grouped by average linkage of th cosine similarity scores
 Going to cut the tree @ 0.6 height which is likely ~genus level or greater
 ```{r}
 proteinTree <- ape::read.tree("data/2018-11-04.newick")

 groups <- dendextend::cutree(proteinTree, h = 0.6)
 ```


 ```{r}
 proteinGroups <- split(names(groups), as.vector(groups))
 ```




 ```{r}
 db2 <- dplyr::tbl(db, "IndividualSpectra")
 ```


 get list of small mol peaks, grouped by protein groupings
 ```{r}

 speaks <- lapply(proteinGroups, 
                 function(x){
                   
                   var <- enquo(x) 
  
  db2 %>% 
    filter(Strain_ID %in% !! var) %>% 
    filter(!is.na(smallMoleculePeaks)) %>% 
    select(Strain_ID, smallMoleculePeaks) %>% 
    collect
    

 })

 ```




 ```{r}
 speaks <- lapply(speaks, function(x) split(x$smallMoleculePeaks, x$Strain_ID))

 jone <- lapply(speaks, function(x){
 lapply(x, function(y) sapply(y, function(z) unserialize(memDecompress(z, type = "gzip"))))})


 ```

 ```{r}
 jone <- lapply(jone, function(zz){
        
  lapply(zz, function(xy){
        snrs <- lapply(xy, function(x) x@snr)
        snrs <- lapply(snrs, function(x) which(x > 6))
        
        for(i in seq_along(snrs)){
          
          xy[[i]]@snr <- xy[[i]]@snr[snrs[[i]]]
          xy[[i]]@mass <- xy[[i]]@mass[snrs[[i]]]
          xy[[i]]@intensity <- xy[[i]]@intensity[snrs[[i]]]
        }
        
        snrs <- lapply(xy, function(x) x@mass)
        snrs <- lapply(snrs, function(x) which(x > 100))
        
        for(i in seq_along(snrs)){
          
          xy[[i]]@snr <- xy[[i]]@snr[snrs[[i]]]
          xy[[i]]@mass <- xy[[i]]@mass[snrs[[i]]]
          xy[[i]]@intensity <- xy[[i]]@intensity[snrs[[i]]]
        }
        
        
        snrs <- lapply(xy, function(x) x@mass)
        snrs <- lapply(snrs, function(x) which(x < 3000))
        
        for(i in seq_along(snrs)){
          
          xy[[i]]@snr <- xy[[i]]@snr[snrs[[i]]]
          xy[[i]]@mass <- xy[[i]]@mass[snrs[[i]]]
          xy[[i]]@intensity <- xy[[i]]@intensity[snrs[[i]]]
        }
        
        xy
  })
 })





 ```





 ```{r}
  
 jones <- lapply(jone, function(zz){
  lapply(zz, function(xy){

    xy <- MALDIquant::binPeaks(xy, tolerance=.002)
    
    xy <- MALDIquant::filterPeaks(xy, minFrequency = .5)
    
    MALDIquant::mergeMassPeaks(xy)


 })
 })

 ```


 ```{r}
 masses <- lapply(jones, function(x) lapply(x, function(y) y@mass) )

 binVec <- lapply(masses, function(x) binnR(vectorlist = x,
                                           ppm = 500,
                                           low = 100,
                                           high = 3000,
                                           increment = .01))

 ```





 ```{r}

 # 
 # pp <- lapply(binVec, function(x) 
 #   
 #   if(length(x) < 2){ return("Only one")
 #     }else{
 #       vegan::diversity(t(do.call(cbind, x)))
 #       }
 #   )


 pp <- lapply(binVec, function(x) 
  
  if(length(x) < 2){ return("Only one")
    }else{
      
      
      coop::tcosine(do.call(rbind, x))
      
      
            }
  )



 ```

 Get labels
 ```{r}
 binVec2 <- binVec[-which(pp == "Only one")]

 binVec3 <- binVec[which(pp == "Only one")]

 onlyone <-unname(unlist(lapply(binVec3, labels)))

 pp2<-pp

 pp <- pp2[-which(pp2 == "Only one")]

 ```

 ___


 ```{r}


 hello <- lapply(binVec2,function(i){
  
   a <- i


  z2 <- do.call(rbind,a)
  z2 <- colSums(z2)
  z3 <- as.numeric(z2 > 0)
  sz3<-sum(z3)
  
 prioritized <- NULL 
 captured <- NULL
  while(length(a) > 1){

      
        zz <- sapply(a,
                     function(x){
        
                       y <-  z3 - x
                       
                  1 - (sum(y==1) /sz3)
        })

        
      oo <-   order(zz, decreasing = T)[[1]]
      prioritized <- c(prioritized, names(a)[[oo]])
      captured <- c(captured, sort(zz, decreasing = T)[[1]])
       z3 <- z3 - a[[oo]]
       
       a <- a[-oo]
      
  
  }

 
 return(prioritized[captured < .75])
 

 })

 q <- c(onlyone, unname(unlist(hello)))

 write.csv(q, "prioritized.csv")
 ```


 ```{r}
 pp<-pp2
 ```


 ```{r}

 for(i in as.vector(which(unlist(lapply(pp, function(x) (sum(ncol(x) > 5)) == 1))))){

  
 aa <- 1 - as.dist(pp[[i]])
 bb <- hclust(aa, "average")
 dd <- as.dendrogram(bb)  
 dd <- dd %>% set("labels_cex", .7)


 plot(hang.dendrogram(dd, hang = 0.1), ylim=c(0,1))
 try(
 dd %>% rect.dendrogram(h=.4, 
                           border = 8, lty = 5, lwd = 2) 
 )

 a <- binVec[[i]]



  z2 <- do.call(rbind,a)
     z2 <- colSums(z2)
     
 zz <- lapply(seq_along(a),
             function(x){

     z <- do.call(rbind,a[-x])
     z <- colSums(z)
   
     
     uniqueVal <- which(a[[x]] > z)
     lengUniqueVal <- length(uniqueVal)
     
     perc1 <- lengUniqueVal / sum(a[[x]])
     perc2 <- lengUniqueVal / sum(z2 > 0)

 return(list(perc1 = perc1,
            perc2 = perc2))
     
 })



 q <- do.call(cbind,zz)

 q[1,] <- unlist(q[1,])*-1
 p <- match(labels(dd), labels(a))
 barplot(unlist(q[1,])[p], ylim = c(min(unlist(q)), max(unlist(q))), col = "red")
 barplot(unlist(q[2,])[p], add = TRUE, col="blue")



  
  
  
  
 a <- binVec[[i]]


  z2 <- do.call(rbind,a)
  z2 <- colSums(z2)
  z3 <- as.numeric(z2 > 0)
  sz3<-sum(z3)
  
 prioritized <- NULL 
 captured <- NULL
  while(length(a) > 1){

      
        zz <- sapply(a,
                     function(x){
        
                       y <-  z3 - x
                       
                  1 - (sum(y==1) /sz3)
        })

        
      oo <-   order(zz, decreasing = T)[[1]]
      prioritized <- c(prioritized, names(a)[[oo]])
      captured <- c(captured, sort(zz, decreasing = T)[[1]])
       z3 <- z3 - a[[oo]]
       
       a <- a[-oo]
      
     
      
               
               
               
  
  }

 plot(captured)
 text(1:192, captured, prioritized, cex=0.5, pos=4, col="red")

 }
 ```





















 ```{r}
 pathpdf <- "C:/Users/chase/Documents/GitHub/twoLittleSponges/data/hplots/1.pdf"
 pdf(file = pathpdf)


 for(i in as.vector(which(unlist(lapply(pp, function(x) (sum(ncol(x) > 5)) == 1))))){
 layout(matrix(c(1,1,1,2,2,2,3,3,3), 3,3, byrow = T))
  
   aa <- 1 - as.dist(pp[[i]])
 bb <- hclust(aa, "average")
 dd <- as.dendrogram(bb)  
 dd <- dd %>% set("labels_cex", .7)


 plot(hang.dendrogram(dd, hang = 0.1), ylim=c(0,1))
 try(
 dd %>% rect.dendrogram(h=.4, 
                           border = 8, lty = 5, lwd = 2) 
 )

 a <- binVec[[i]]



  z2 <- do.call(rbind,a)
     z2 <- colSums(z2)
     
 zz <- lapply(seq_along(a),
             function(x){

     z <- do.call(rbind,a[-x])
     z <- colSums(z)
   
     
     uniqueVal <- which(a[[x]] > z)
     lengUniqueVal <- length(uniqueVal)
     
     perc1 <- lengUniqueVal / sum(a[[x]])
     perc2 <- lengUniqueVal / sum(z2 > 0)

 return(list(perc1 = perc1,
            perc2 = perc2))
     
 })



 q <- do.call(cbind,zz)

 q[1,] <- unlist(q[1,])*-1
 p <- match(labels(dd), labels(a))
 barplot(unlist(q[1,])[p], ylim = c(min(unlist(q)), max(unlist(q))), col = "red")
 barplot(unlist(q[2,])[p], add = TRUE, col="blue")



  
  
  
  
 a <- binVec[[i]]


  z2 <- do.call(rbind,a)
  z2 <- colSums(z2)
  z3 <- as.numeric(z2 > 0)
  sz3<-sum(z3)
  
 prioritized <- NULL 
 captured <- NULL
  while(length(a) > 1){

      
        zz <- sapply(a,
                     function(x){
        
                       y <-  z3 - x
                       
                  1 - (sum(y==1) /sz3)
        })

        
      oo <-   order(zz, decreasing = T)[[1]]
      prioritized <- c(prioritized, names(a)[[oo]])
      captured <- c(captured, sort(zz, decreasing = T)[[1]])
       z3 <- z3 - a[[oo]]
       
       a <- a[-oo]
      
     
      
               
               
               
  
  }

 plot(captured)
 text(1:192, captured, prioritized, cex=0.5, pos=4, col="red")
 abline(h=.75, col= "grey")
 abline(h=.80, col= "grey")
 abline(h=.85, col= "grey")
 abline(h=.90, col= "grey")

 }

 dev.off()

 ```







 ```{r}

 aa <- lapply(as.vector(which(unlist(lapply(pp, function(x) (sum(ncol(x) > 2)) == 1)))), function(x){
  
  a <- binVec[[x]]


  z2 <- do.call(rbind,a)
  z2 <- colSums(z2)
  z3 <- as.numeric(z2 > 0)
  sz3<-sum(z3)
  
 prioritized <- NULL 
 captured <- NULL
  while(length(a) > 1){

      
        zz <- sapply(a,
                     function(x){
        
                       y <-  z3 - x
                       
                  1 - (sum(y==1) /sz3)
        })

        
      oo <-   order(zz, decreasing = T)[[1]]
      prioritized <- c(prioritized, names(a)[[oo]])
      captured <- c(captured, sort(zz, decreasing = T)[[1]])
       z3 <- z3 - a[[oo]]
       
       a <- a[-oo]
     
  } 
 
 list(prioritized =prioritized, captured = captured)
 })
     
      
 ```
	---
	title: "Sponge Picking"
	author: "Chase Clark"
	date: "November 4, 2018"
	output: html_document
	---

	```{r setup, include=FALSE}
	knitr::opts_chunk$set(echo = TRUE)
	```

	```{r}
	library(pool)
	library(ape)
	library(dplyr)
	library(magrittr)
	library(RSQLite)
	library(pool)
	library(dendextend)
	```


	```{r}

	binnR <- function (vectorlist, ppm, low, high, increment){

	# Create a vector entry for every unique element
	longVector <- seq(from = low,
	to = high,
	by = increment)
	# length of vector that will be created
	nx <- length(longVector)
	# Adjust ppm to decimal tolarance across long vector
	toll <- ppm / 10e5 * longVector
	# Iterate over the provided list of vectors
	lapply(vectorlist, function(vec){

	matches <- rep(0, nx)
	ry <- 1:length(vec)
	for (i in seq_along(vec)) {
	# returns abs diff across entire long vector
	dif <- abs(longVector - vec[i])
	matches[which(dif <= toll)] <- 1
	}

	matches

	})
	}
	```


	All MALDI data up to this point was run through the new version of IDBac, which produces a single sqlite file that contains all the mzml files, processed spectra, etc.
	```{r}
	db <-"data/Sponge.sqlite"
	```

	Connect to database
	```{r}
	db <- pool::dbPool(drv = RSQLite::SQLite(),
	dbname = db)
	```


	In the IDBac software all protein spectra were processed and grouped by average linkage of th cosine similarity scores
	Going to cut the tree @ 0.6 height which is likely ~genus level or greater
	```{r}
	proteinTree <- ape::read.tree("data/2018-11-04.newick")

	groups <- dendextend::cutree(proteinTree, h = 0.6)
	```


	```{r}
	proteinGroups <- split(names(groups), as.vector(groups))
	```




	```{r}
	db2 <- dplyr::tbl(db, "IndividualSpectra")
	```


	get list of small mol peaks, grouped by protein groupings
	```{r}

	speaks <- lapply(proteinGroups,
	function(x){

	var <- enquo(x)

	db2 %>%
	filter(Strain_ID %in% !! var) %>%
	filter(!is.na(smallMoleculePeaks)) %>%
	select(Strain_ID, smallMoleculePeaks) %>%
	collect


	})

	```




	```{r}
	speaks <- lapply(speaks, function(x) split(x$smallMoleculePeaks, x$Strain_ID))

	jone <- lapply(speaks, function(x){
	lapply(x, function(y) sapply(y, function(z) unserialize(memDecompress(z, type = "gzip"))))})


	```

	```{r}
	jone <- lapply(jone, function(zz){

	lapply(zz, function(xy){
	snrs <- lapply(xy, function(x) x@snr)
	snrs <- lapply(snrs, function(x) which(x > 6))

	for(i in seq_along(snrs)){

	xy[[i]]@snr <- xy[[i]]@snr[snrs[[i]]]
	xy[[i]]@mass <- xy[[i]]@mass[snrs[[i]]]
	xy[[i]]@intensity <- xy[[i]]@intensity[snrs[[i]]]
	}

	snrs <- lapply(xy, function(x) x@mass)
	snrs <- lapply(snrs, function(x) which(x > 100))

	for(i in seq_along(snrs)){

	xy[[i]]@snr <- xy[[i]]@snr[snrs[[i]]]
	xy[[i]]@mass <- xy[[i]]@mass[snrs[[i]]]
	xy[[i]]@intensity <- xy[[i]]@intensity[snrs[[i]]]
	}


	snrs <- lapply(xy, function(x) x@mass)
	snrs <- lapply(snrs, function(x) which(x < 3000))

	for(i in seq_along(snrs)){

	xy[[i]]@snr <- xy[[i]]@snr[snrs[[i]]]
	xy[[i]]@mass <- xy[[i]]@mass[snrs[[i]]]
	xy[[i]]@intensity <- xy[[i]]@intensity[snrs[[i]]]
	}

	xy
	})
	})





	```





	```{r}

	jones <- lapply(jone, function(zz){
	lapply(zz, function(xy){

	xy <- MALDIquant::binPeaks(xy, tolerance=.002)

	xy <- MALDIquant::filterPeaks(xy, minFrequency = .5)

	MALDIquant::mergeMassPeaks(xy)


	})
	})

	```


	```{r}
	masses <- lapply(jones, function(x) lapply(x, function(y) y@mass) )

	binVec <- lapply(masses, function(x) binnR(vectorlist = x,
	ppm = 500,
	low = 100,
	high = 3000,
	increment = .01))

	```





	```{r}

	#
	# pp <- lapply(binVec, function(x)
	#
	# if(length(x) < 2){ return("Only one")
	# }else{
	# vegan::diversity(t(do.call(cbind, x)))
	# }
	# )


	pp <- lapply(binVec, function(x)

	if(length(x) < 2){ return("Only one")
	}else{


	coop::tcosine(do.call(rbind, x))


	}
	)



	```

	Get labels
	```{r}
	binVec2 <- binVec[-which(pp == "Only one")]

	binVec3 <- binVec[which(pp == "Only one")]

	onlyone <-unname(unlist(lapply(binVec3, labels)))

	pp2<-pp

	pp <- pp2[-which(pp2 == "Only one")]

	```

	___


	```{r}


	hello <- lapply(binVec2,function(i){

	a <- i


	z2 <- do.call(rbind,a)
	z2 <- colSums(z2)
	z3 <- as.numeric(z2 > 0)
	sz3<-sum(z3)

	prioritized <- NULL
	captured <- NULL
	while(length(a) > 1){


	zz <- sapply(a,
	function(x){

	y <- z3 - x

	1 - (sum(y==1) /sz3)
	})


	oo <- order(zz, decreasing = T)[[1]]
	prioritized <- c(prioritized, names(a)[[oo]])
	captured <- c(captured, sort(zz, decreasing = T)[[1]])
	z3 <- z3 - a[[oo]]

	a <- a[-oo]


	}


	return(prioritized[captured < .75])


	})

	q <- c(onlyone, unname(unlist(hello)))

	write.csv(q, "prioritized.csv")
	```


	```{r}
	pp<-pp2
	```


	```{r}

	for(i in as.vector(which(unlist(lapply(pp, function(x) (sum(ncol(x) > 5)) == 1))))){


	aa <- 1 - as.dist(pp[[i]])
	bb <- hclust(aa, "average")
	dd <- as.dendrogram(bb)
	dd <- dd %>% set("labels_cex", .7)


	plot(hang.dendrogram(dd, hang = 0.1), ylim=c(0,1))
	try(
	dd %>% rect.dendrogram(h=.4,
	border = 8, lty = 5, lwd = 2)
	)

	a <- binVec[[i]]



	z2 <- do.call(rbind,a)
	z2 <- colSums(z2)

	zz <- lapply(seq_along(a),
	function(x){

	z <- do.call(rbind,a[-x])
	z <- colSums(z)


	uniqueVal <- which(a[[x]] > z)
	lengUniqueVal <- length(uniqueVal)

	perc1 <- lengUniqueVal / sum(a[[x]])
	perc2 <- lengUniqueVal / sum(z2 > 0)

	return(list(perc1 = perc1,
	perc2 = perc2))

	})



	q <- do.call(cbind,zz)

	q[1,] <- unlist(q[1,])*-1
	p <- match(labels(dd), labels(a))
	barplot(unlist(q[1,])[p], ylim = c(min(unlist(q)), max(unlist(q))), col = "red")
	barplot(unlist(q[2,])[p], add = TRUE, col="blue")







	a <- binVec[[i]]


	z2 <- do.call(rbind,a)
	z2 <- colSums(z2)
	z3 <- as.numeric(z2 > 0)
	sz3<-sum(z3)

	prioritized <- NULL
	captured <- NULL
	while(length(a) > 1){


	zz <- sapply(a,
	function(x){

	y <- z3 - x

	1 - (sum(y==1) /sz3)
	})


	oo <- order(zz, decreasing = T)[[1]]
	prioritized <- c(prioritized, names(a)[[oo]])
	captured <- c(captured, sort(zz, decreasing = T)[[1]])
	z3 <- z3 - a[[oo]]

	a <- a[-oo]







	}

	plot(captured)
	text(1:192, captured, prioritized, cex=0.5, pos=4, col="red")

	}
	```





















	```{r}
	pathpdf <- "C:/Users/chase/Documents/GitHub/twoLittleSponges/data/hplots/1.pdf"
	pdf(file = pathpdf)


	for(i in as.vector(which(unlist(lapply(pp, function(x) (sum(ncol(x) > 5)) == 1))))){
	layout(matrix(c(1,1,1,2,2,2,3,3,3), 3,3, byrow = T))

	aa <- 1 - as.dist(pp[[i]])
	bb <- hclust(aa, "average")
	dd <- as.dendrogram(bb)
	dd <- dd %>% set("labels_cex", .7)


	plot(hang.dendrogram(dd, hang = 0.1), ylim=c(0,1))
	try(
	dd %>% rect.dendrogram(h=.4,
	border = 8, lty = 5, lwd = 2)
	)

	a <- binVec[[i]]



	z2 <- do.call(rbind,a)
	z2 <- colSums(z2)

	zz <- lapply(seq_along(a),
	function(x){

	z <- do.call(rbind,a[-x])
	z <- colSums(z)


	uniqueVal <- which(a[[x]] > z)
	lengUniqueVal <- length(uniqueVal)

	perc1 <- lengUniqueVal / sum(a[[x]])
	perc2 <- lengUniqueVal / sum(z2 > 0)

	return(list(perc1 = perc1,
	perc2 = perc2))

	})



	q <- do.call(cbind,zz)

	q[1,] <- unlist(q[1,])*-1
	p <- match(labels(dd), labels(a))
	barplot(unlist(q[1,])[p], ylim = c(min(unlist(q)), max(unlist(q))), col = "red")
	barplot(unlist(q[2,])[p], add = TRUE, col="blue")







	a <- binVec[[i]]


	z2 <- do.call(rbind,a)
	z2 <- colSums(z2)
	z3 <- as.numeric(z2 > 0)
	sz3<-sum(z3)

	prioritized <- NULL
	captured <- NULL
	while(length(a) > 1){


	zz <- sapply(a,
	function(x){

	y <- z3 - x

	1 - (sum(y==1) /sz3)
	})


	oo <- order(zz, decreasing = T)[[1]]
	prioritized <- c(prioritized, names(a)[[oo]])
	captured <- c(captured, sort(zz, decreasing = T)[[1]])
	z3 <- z3 - a[[oo]]

	a <- a[-oo]







	}

	plot(captured)
	text(1:192, captured, prioritized, cex=0.5, pos=4, col="red")
	abline(h=.75, col= "grey")
	abline(h=.80, col= "grey")
	abline(h=.85, col= "grey")
	abline(h=.90, col= "grey")

	}

	dev.off()

	```







	```{r}

	aa <- lapply(as.vector(which(unlist(lapply(pp, function(x) (sum(ncol(x) > 2)) == 1)))), function(x){

	a <- binVec[[x]]


	z2 <- do.call(rbind,a)
	z2 <- colSums(z2)
	z3 <- as.numeric(z2 > 0)
	sz3<-sum(z3)

	prioritized <- NULL
	captured <- NULL
	while(length(a) > 1){


	zz <- sapply(a,
	function(x){

	y <- z3 - x

	1 - (sum(y==1) /sz3)
	})


	oo <- order(zz, decreasing = T)[[1]]
	prioritized <- c(prioritized, names(a)[[oo]])
	captured <- c(captured, sort(zz, decreasing = T)[[1]])
	z3 <- z3 - a[[oo]]

	a <- a[-oo]

	}

	list(prioritized =prioritized, captured = captured)
	})


	```