Skip to content

Instantly share code, notes, and snippets.

@chasemc
Created December 19, 2018 23:37
Show Gist options
  • Save chasemc/c69c4cb624268f1b50dd0d62539ea7cf to your computer and use it in GitHub Desktop.
Save chasemc/c69c4cb624268f1b50dd0d62539ea7cf to your computer and use it in GitHub Desktop.
auto_prior
---
title: "Sponge Picking"
author: "Chase Clark"
date: "November 4, 2018"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
library(pool)
library(ape)
library(dplyr)
library(magrittr)
library(RSQLite)
library(pool)
library(dendextend)
```
```{r}
binnR <- function (vectorlist, ppm, low, high, increment){
# Create a vector entry for every unique element
longVector <- seq(from = low,
to = high,
by = increment)
# length of vector that will be created
nx <- length(longVector)
# Adjust ppm to decimal tolarance across long vector
toll <- ppm / 10e5 * longVector
# Iterate over the provided list of vectors
lapply(vectorlist, function(vec){
matches <- rep(0, nx)
ry <- 1:length(vec)
for (i in seq_along(vec)) {
# returns abs diff across entire long vector
dif <- abs(longVector - vec[i])
matches[which(dif <= toll)] <- 1
}
matches
})
}
```
All MALDI data up to this point was run through the new version of IDBac, which produces a single sqlite file that contains all the mzml files, processed spectra, etc.
```{r}
db <-"data/Sponge.sqlite"
```
Connect to database
```{r}
db <- pool::dbPool(drv = RSQLite::SQLite(),
dbname = db)
```
In the IDBac software all protein spectra were processed and grouped by average linkage of th cosine similarity scores
Going to cut the tree @ 0.6 height which is likely ~genus level or greater
```{r}
proteinTree <- ape::read.tree("data/2018-11-04.newick")
groups <- dendextend::cutree(proteinTree, h = 0.6)
```
```{r}
proteinGroups <- split(names(groups), as.vector(groups))
```
```{r}
db2 <- dplyr::tbl(db, "IndividualSpectra")
```
get list of small mol peaks, grouped by protein groupings
```{r}
speaks <- lapply(proteinGroups,
function(x){
var <- enquo(x)
db2 %>%
filter(Strain_ID %in% !! var) %>%
filter(!is.na(smallMoleculePeaks)) %>%
select(Strain_ID, smallMoleculePeaks) %>%
collect
})
```
```{r}
speaks <- lapply(speaks, function(x) split(x$smallMoleculePeaks, x$Strain_ID))
jone <- lapply(speaks, function(x){
lapply(x, function(y) sapply(y, function(z) unserialize(memDecompress(z, type = "gzip"))))})
```
```{r}
jone <- lapply(jone, function(zz){
lapply(zz, function(xy){
snrs <- lapply(xy, function(x) x@snr)
snrs <- lapply(snrs, function(x) which(x > 6))
for(i in seq_along(snrs)){
xy[[i]]@snr <- xy[[i]]@snr[snrs[[i]]]
xy[[i]]@mass <- xy[[i]]@mass[snrs[[i]]]
xy[[i]]@intensity <- xy[[i]]@intensity[snrs[[i]]]
}
snrs <- lapply(xy, function(x) x@mass)
snrs <- lapply(snrs, function(x) which(x > 100))
for(i in seq_along(snrs)){
xy[[i]]@snr <- xy[[i]]@snr[snrs[[i]]]
xy[[i]]@mass <- xy[[i]]@mass[snrs[[i]]]
xy[[i]]@intensity <- xy[[i]]@intensity[snrs[[i]]]
}
snrs <- lapply(xy, function(x) x@mass)
snrs <- lapply(snrs, function(x) which(x < 3000))
for(i in seq_along(snrs)){
xy[[i]]@snr <- xy[[i]]@snr[snrs[[i]]]
xy[[i]]@mass <- xy[[i]]@mass[snrs[[i]]]
xy[[i]]@intensity <- xy[[i]]@intensity[snrs[[i]]]
}
xy
})
})
```
```{r}
jones <- lapply(jone, function(zz){
lapply(zz, function(xy){
xy <- MALDIquant::binPeaks(xy, tolerance=.002)
xy <- MALDIquant::filterPeaks(xy, minFrequency = .5)
MALDIquant::mergeMassPeaks(xy)
})
})
```
```{r}
masses <- lapply(jones, function(x) lapply(x, function(y) y@mass) )
binVec <- lapply(masses, function(x) binnR(vectorlist = x,
ppm = 500,
low = 100,
high = 3000,
increment = .01))
```
```{r}
#
# pp <- lapply(binVec, function(x)
#
# if(length(x) < 2){ return("Only one")
# }else{
# vegan::diversity(t(do.call(cbind, x)))
# }
# )
pp <- lapply(binVec, function(x)
if(length(x) < 2){ return("Only one")
}else{
coop::tcosine(do.call(rbind, x))
}
)
```
Get labels
```{r}
binVec2 <- binVec[-which(pp == "Only one")]
binVec3 <- binVec[which(pp == "Only one")]
onlyone <-unname(unlist(lapply(binVec3, labels)))
pp2<-pp
pp <- pp2[-which(pp2 == "Only one")]
```
___
```{r}
hello <- lapply(binVec2,function(i){
a <- i
z2 <- do.call(rbind,a)
z2 <- colSums(z2)
z3 <- as.numeric(z2 > 0)
sz3<-sum(z3)
prioritized <- NULL
captured <- NULL
while(length(a) > 1){
zz <- sapply(a,
function(x){
y <- z3 - x
1 - (sum(y==1) /sz3)
})
oo <- order(zz, decreasing = T)[[1]]
prioritized <- c(prioritized, names(a)[[oo]])
captured <- c(captured, sort(zz, decreasing = T)[[1]])
z3 <- z3 - a[[oo]]
a <- a[-oo]
}
return(prioritized[captured < .75])
})
q <- c(onlyone, unname(unlist(hello)))
write.csv(q, "prioritized.csv")
```
```{r}
pp<-pp2
```
```{r}
for(i in as.vector(which(unlist(lapply(pp, function(x) (sum(ncol(x) > 5)) == 1))))){
aa <- 1 - as.dist(pp[[i]])
bb <- hclust(aa, "average")
dd <- as.dendrogram(bb)
dd <- dd %>% set("labels_cex", .7)
plot(hang.dendrogram(dd, hang = 0.1), ylim=c(0,1))
try(
dd %>% rect.dendrogram(h=.4,
border = 8, lty = 5, lwd = 2)
)
a <- binVec[[i]]
z2 <- do.call(rbind,a)
z2 <- colSums(z2)
zz <- lapply(seq_along(a),
function(x){
z <- do.call(rbind,a[-x])
z <- colSums(z)
uniqueVal <- which(a[[x]] > z)
lengUniqueVal <- length(uniqueVal)
perc1 <- lengUniqueVal / sum(a[[x]])
perc2 <- lengUniqueVal / sum(z2 > 0)
return(list(perc1 = perc1,
perc2 = perc2))
})
q <- do.call(cbind,zz)
q[1,] <- unlist(q[1,])*-1
p <- match(labels(dd), labels(a))
barplot(unlist(q[1,])[p], ylim = c(min(unlist(q)), max(unlist(q))), col = "red")
barplot(unlist(q[2,])[p], add = TRUE, col="blue")
a <- binVec[[i]]
z2 <- do.call(rbind,a)
z2 <- colSums(z2)
z3 <- as.numeric(z2 > 0)
sz3<-sum(z3)
prioritized <- NULL
captured <- NULL
while(length(a) > 1){
zz <- sapply(a,
function(x){
y <- z3 - x
1 - (sum(y==1) /sz3)
})
oo <- order(zz, decreasing = T)[[1]]
prioritized <- c(prioritized, names(a)[[oo]])
captured <- c(captured, sort(zz, decreasing = T)[[1]])
z3 <- z3 - a[[oo]]
a <- a[-oo]
}
plot(captured)
text(1:192, captured, prioritized, cex=0.5, pos=4, col="red")
}
```
```{r}
pathpdf <- "C:/Users/chase/Documents/GitHub/twoLittleSponges/data/hplots/1.pdf"
pdf(file = pathpdf)
for(i in as.vector(which(unlist(lapply(pp, function(x) (sum(ncol(x) > 5)) == 1))))){
layout(matrix(c(1,1,1,2,2,2,3,3,3), 3,3, byrow = T))
aa <- 1 - as.dist(pp[[i]])
bb <- hclust(aa, "average")
dd <- as.dendrogram(bb)
dd <- dd %>% set("labels_cex", .7)
plot(hang.dendrogram(dd, hang = 0.1), ylim=c(0,1))
try(
dd %>% rect.dendrogram(h=.4,
border = 8, lty = 5, lwd = 2)
)
a <- binVec[[i]]
z2 <- do.call(rbind,a)
z2 <- colSums(z2)
zz <- lapply(seq_along(a),
function(x){
z <- do.call(rbind,a[-x])
z <- colSums(z)
uniqueVal <- which(a[[x]] > z)
lengUniqueVal <- length(uniqueVal)
perc1 <- lengUniqueVal / sum(a[[x]])
perc2 <- lengUniqueVal / sum(z2 > 0)
return(list(perc1 = perc1,
perc2 = perc2))
})
q <- do.call(cbind,zz)
q[1,] <- unlist(q[1,])*-1
p <- match(labels(dd), labels(a))
barplot(unlist(q[1,])[p], ylim = c(min(unlist(q)), max(unlist(q))), col = "red")
barplot(unlist(q[2,])[p], add = TRUE, col="blue")
a <- binVec[[i]]
z2 <- do.call(rbind,a)
z2 <- colSums(z2)
z3 <- as.numeric(z2 > 0)
sz3<-sum(z3)
prioritized <- NULL
captured <- NULL
while(length(a) > 1){
zz <- sapply(a,
function(x){
y <- z3 - x
1 - (sum(y==1) /sz3)
})
oo <- order(zz, decreasing = T)[[1]]
prioritized <- c(prioritized, names(a)[[oo]])
captured <- c(captured, sort(zz, decreasing = T)[[1]])
z3 <- z3 - a[[oo]]
a <- a[-oo]
}
plot(captured)
text(1:192, captured, prioritized, cex=0.5, pos=4, col="red")
abline(h=.75, col= "grey")
abline(h=.80, col= "grey")
abline(h=.85, col= "grey")
abline(h=.90, col= "grey")
}
dev.off()
```
```{r}
aa <- lapply(as.vector(which(unlist(lapply(pp, function(x) (sum(ncol(x) > 2)) == 1)))), function(x){
a <- binVec[[x]]
z2 <- do.call(rbind,a)
z2 <- colSums(z2)
z3 <- as.numeric(z2 > 0)
sz3<-sum(z3)
prioritized <- NULL
captured <- NULL
while(length(a) > 1){
zz <- sapply(a,
function(x){
y <- z3 - x
1 - (sum(y==1) /sz3)
})
oo <- order(zz, decreasing = T)[[1]]
prioritized <- c(prioritized, names(a)[[oo]])
captured <- c(captured, sort(zz, decreasing = T)[[1]])
z3 <- z3 - a[[oo]]
a <- a[-oo]
}
list(prioritized =prioritized, captured = captured)
})
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment