Created
December 19, 2018 23:37
-
-
Save chasemc/c69c4cb624268f1b50dd0d62539ea7cf to your computer and use it in GitHub Desktop.
auto_prior
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "Sponge Picking" | |
author: "Chase Clark" | |
date: "November 4, 2018" | |
output: html_document | |
--- | |
```{r setup, include=FALSE} | |
knitr::opts_chunk$set(echo = TRUE) | |
``` | |
```{r} | |
library(pool) | |
library(ape) | |
library(dplyr) | |
library(magrittr) | |
library(RSQLite) | |
library(pool) | |
library(dendextend) | |
``` | |
```{r} | |
binnR <- function (vectorlist, ppm, low, high, increment){ | |
# Create a vector entry for every unique element | |
longVector <- seq(from = low, | |
to = high, | |
by = increment) | |
# length of vector that will be created | |
nx <- length(longVector) | |
# Adjust ppm to decimal tolarance across long vector | |
toll <- ppm / 10e5 * longVector | |
# Iterate over the provided list of vectors | |
lapply(vectorlist, function(vec){ | |
matches <- rep(0, nx) | |
ry <- 1:length(vec) | |
for (i in seq_along(vec)) { | |
# returns abs diff across entire long vector | |
dif <- abs(longVector - vec[i]) | |
matches[which(dif <= toll)] <- 1 | |
} | |
matches | |
}) | |
} | |
``` | |
All MALDI data up to this point was run through the new version of IDBac, which produces a single sqlite file that contains all the mzml files, processed spectra, etc. | |
```{r} | |
db <-"data/Sponge.sqlite" | |
``` | |
Connect to database | |
```{r} | |
db <- pool::dbPool(drv = RSQLite::SQLite(), | |
dbname = db) | |
``` | |
In the IDBac software all protein spectra were processed and grouped by average linkage of th cosine similarity scores | |
Going to cut the tree @ 0.6 height which is likely ~genus level or greater | |
```{r} | |
proteinTree <- ape::read.tree("data/2018-11-04.newick") | |
groups <- dendextend::cutree(proteinTree, h = 0.6) | |
``` | |
```{r} | |
proteinGroups <- split(names(groups), as.vector(groups)) | |
``` | |
```{r} | |
db2 <- dplyr::tbl(db, "IndividualSpectra") | |
``` | |
get list of small mol peaks, grouped by protein groupings | |
```{r} | |
speaks <- lapply(proteinGroups, | |
function(x){ | |
var <- enquo(x) | |
db2 %>% | |
filter(Strain_ID %in% !! var) %>% | |
filter(!is.na(smallMoleculePeaks)) %>% | |
select(Strain_ID, smallMoleculePeaks) %>% | |
collect | |
}) | |
``` | |
```{r} | |
speaks <- lapply(speaks, function(x) split(x$smallMoleculePeaks, x$Strain_ID)) | |
jone <- lapply(speaks, function(x){ | |
lapply(x, function(y) sapply(y, function(z) unserialize(memDecompress(z, type = "gzip"))))}) | |
``` | |
```{r} | |
jone <- lapply(jone, function(zz){ | |
lapply(zz, function(xy){ | |
snrs <- lapply(xy, function(x) x@snr) | |
snrs <- lapply(snrs, function(x) which(x > 6)) | |
for(i in seq_along(snrs)){ | |
xy[[i]]@snr <- xy[[i]]@snr[snrs[[i]]] | |
xy[[i]]@mass <- xy[[i]]@mass[snrs[[i]]] | |
xy[[i]]@intensity <- xy[[i]]@intensity[snrs[[i]]] | |
} | |
snrs <- lapply(xy, function(x) x@mass) | |
snrs <- lapply(snrs, function(x) which(x > 100)) | |
for(i in seq_along(snrs)){ | |
xy[[i]]@snr <- xy[[i]]@snr[snrs[[i]]] | |
xy[[i]]@mass <- xy[[i]]@mass[snrs[[i]]] | |
xy[[i]]@intensity <- xy[[i]]@intensity[snrs[[i]]] | |
} | |
snrs <- lapply(xy, function(x) x@mass) | |
snrs <- lapply(snrs, function(x) which(x < 3000)) | |
for(i in seq_along(snrs)){ | |
xy[[i]]@snr <- xy[[i]]@snr[snrs[[i]]] | |
xy[[i]]@mass <- xy[[i]]@mass[snrs[[i]]] | |
xy[[i]]@intensity <- xy[[i]]@intensity[snrs[[i]]] | |
} | |
xy | |
}) | |
}) | |
``` | |
```{r} | |
jones <- lapply(jone, function(zz){ | |
lapply(zz, function(xy){ | |
xy <- MALDIquant::binPeaks(xy, tolerance=.002) | |
xy <- MALDIquant::filterPeaks(xy, minFrequency = .5) | |
MALDIquant::mergeMassPeaks(xy) | |
}) | |
}) | |
``` | |
```{r} | |
masses <- lapply(jones, function(x) lapply(x, function(y) y@mass) ) | |
binVec <- lapply(masses, function(x) binnR(vectorlist = x, | |
ppm = 500, | |
low = 100, | |
high = 3000, | |
increment = .01)) | |
``` | |
```{r} | |
# | |
# pp <- lapply(binVec, function(x) | |
# | |
# if(length(x) < 2){ return("Only one") | |
# }else{ | |
# vegan::diversity(t(do.call(cbind, x))) | |
# } | |
# ) | |
pp <- lapply(binVec, function(x) | |
if(length(x) < 2){ return("Only one") | |
}else{ | |
coop::tcosine(do.call(rbind, x)) | |
} | |
) | |
``` | |
Get labels | |
```{r} | |
binVec2 <- binVec[-which(pp == "Only one")] | |
binVec3 <- binVec[which(pp == "Only one")] | |
onlyone <-unname(unlist(lapply(binVec3, labels))) | |
pp2<-pp | |
pp <- pp2[-which(pp2 == "Only one")] | |
``` | |
___ | |
```{r} | |
hello <- lapply(binVec2,function(i){ | |
a <- i | |
z2 <- do.call(rbind,a) | |
z2 <- colSums(z2) | |
z3 <- as.numeric(z2 > 0) | |
sz3<-sum(z3) | |
prioritized <- NULL | |
captured <- NULL | |
while(length(a) > 1){ | |
zz <- sapply(a, | |
function(x){ | |
y <- z3 - x | |
1 - (sum(y==1) /sz3) | |
}) | |
oo <- order(zz, decreasing = T)[[1]] | |
prioritized <- c(prioritized, names(a)[[oo]]) | |
captured <- c(captured, sort(zz, decreasing = T)[[1]]) | |
z3 <- z3 - a[[oo]] | |
a <- a[-oo] | |
} | |
return(prioritized[captured < .75]) | |
}) | |
q <- c(onlyone, unname(unlist(hello))) | |
write.csv(q, "prioritized.csv") | |
``` | |
```{r} | |
pp<-pp2 | |
``` | |
```{r} | |
for(i in as.vector(which(unlist(lapply(pp, function(x) (sum(ncol(x) > 5)) == 1))))){ | |
aa <- 1 - as.dist(pp[[i]]) | |
bb <- hclust(aa, "average") | |
dd <- as.dendrogram(bb) | |
dd <- dd %>% set("labels_cex", .7) | |
plot(hang.dendrogram(dd, hang = 0.1), ylim=c(0,1)) | |
try( | |
dd %>% rect.dendrogram(h=.4, | |
border = 8, lty = 5, lwd = 2) | |
) | |
a <- binVec[[i]] | |
z2 <- do.call(rbind,a) | |
z2 <- colSums(z2) | |
zz <- lapply(seq_along(a), | |
function(x){ | |
z <- do.call(rbind,a[-x]) | |
z <- colSums(z) | |
uniqueVal <- which(a[[x]] > z) | |
lengUniqueVal <- length(uniqueVal) | |
perc1 <- lengUniqueVal / sum(a[[x]]) | |
perc2 <- lengUniqueVal / sum(z2 > 0) | |
return(list(perc1 = perc1, | |
perc2 = perc2)) | |
}) | |
q <- do.call(cbind,zz) | |
q[1,] <- unlist(q[1,])*-1 | |
p <- match(labels(dd), labels(a)) | |
barplot(unlist(q[1,])[p], ylim = c(min(unlist(q)), max(unlist(q))), col = "red") | |
barplot(unlist(q[2,])[p], add = TRUE, col="blue") | |
a <- binVec[[i]] | |
z2 <- do.call(rbind,a) | |
z2 <- colSums(z2) | |
z3 <- as.numeric(z2 > 0) | |
sz3<-sum(z3) | |
prioritized <- NULL | |
captured <- NULL | |
while(length(a) > 1){ | |
zz <- sapply(a, | |
function(x){ | |
y <- z3 - x | |
1 - (sum(y==1) /sz3) | |
}) | |
oo <- order(zz, decreasing = T)[[1]] | |
prioritized <- c(prioritized, names(a)[[oo]]) | |
captured <- c(captured, sort(zz, decreasing = T)[[1]]) | |
z3 <- z3 - a[[oo]] | |
a <- a[-oo] | |
} | |
plot(captured) | |
text(1:192, captured, prioritized, cex=0.5, pos=4, col="red") | |
} | |
``` | |
```{r} | |
pathpdf <- "C:/Users/chase/Documents/GitHub/twoLittleSponges/data/hplots/1.pdf" | |
pdf(file = pathpdf) | |
for(i in as.vector(which(unlist(lapply(pp, function(x) (sum(ncol(x) > 5)) == 1))))){ | |
layout(matrix(c(1,1,1,2,2,2,3,3,3), 3,3, byrow = T)) | |
aa <- 1 - as.dist(pp[[i]]) | |
bb <- hclust(aa, "average") | |
dd <- as.dendrogram(bb) | |
dd <- dd %>% set("labels_cex", .7) | |
plot(hang.dendrogram(dd, hang = 0.1), ylim=c(0,1)) | |
try( | |
dd %>% rect.dendrogram(h=.4, | |
border = 8, lty = 5, lwd = 2) | |
) | |
a <- binVec[[i]] | |
z2 <- do.call(rbind,a) | |
z2 <- colSums(z2) | |
zz <- lapply(seq_along(a), | |
function(x){ | |
z <- do.call(rbind,a[-x]) | |
z <- colSums(z) | |
uniqueVal <- which(a[[x]] > z) | |
lengUniqueVal <- length(uniqueVal) | |
perc1 <- lengUniqueVal / sum(a[[x]]) | |
perc2 <- lengUniqueVal / sum(z2 > 0) | |
return(list(perc1 = perc1, | |
perc2 = perc2)) | |
}) | |
q <- do.call(cbind,zz) | |
q[1,] <- unlist(q[1,])*-1 | |
p <- match(labels(dd), labels(a)) | |
barplot(unlist(q[1,])[p], ylim = c(min(unlist(q)), max(unlist(q))), col = "red") | |
barplot(unlist(q[2,])[p], add = TRUE, col="blue") | |
a <- binVec[[i]] | |
z2 <- do.call(rbind,a) | |
z2 <- colSums(z2) | |
z3 <- as.numeric(z2 > 0) | |
sz3<-sum(z3) | |
prioritized <- NULL | |
captured <- NULL | |
while(length(a) > 1){ | |
zz <- sapply(a, | |
function(x){ | |
y <- z3 - x | |
1 - (sum(y==1) /sz3) | |
}) | |
oo <- order(zz, decreasing = T)[[1]] | |
prioritized <- c(prioritized, names(a)[[oo]]) | |
captured <- c(captured, sort(zz, decreasing = T)[[1]]) | |
z3 <- z3 - a[[oo]] | |
a <- a[-oo] | |
} | |
plot(captured) | |
text(1:192, captured, prioritized, cex=0.5, pos=4, col="red") | |
abline(h=.75, col= "grey") | |
abline(h=.80, col= "grey") | |
abline(h=.85, col= "grey") | |
abline(h=.90, col= "grey") | |
} | |
dev.off() | |
``` | |
```{r} | |
aa <- lapply(as.vector(which(unlist(lapply(pp, function(x) (sum(ncol(x) > 2)) == 1)))), function(x){ | |
a <- binVec[[x]] | |
z2 <- do.call(rbind,a) | |
z2 <- colSums(z2) | |
z3 <- as.numeric(z2 > 0) | |
sz3<-sum(z3) | |
prioritized <- NULL | |
captured <- NULL | |
while(length(a) > 1){ | |
zz <- sapply(a, | |
function(x){ | |
y <- z3 - x | |
1 - (sum(y==1) /sz3) | |
}) | |
oo <- order(zz, decreasing = T)[[1]] | |
prioritized <- c(prioritized, names(a)[[oo]]) | |
captured <- c(captured, sort(zz, decreasing = T)[[1]]) | |
z3 <- z3 - a[[oo]] | |
a <- a[-oo] | |
} | |
list(prioritized =prioritized, captured = captured) | |
}) | |
``` | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment