Skip to content

Instantly share code, notes, and snippets.

@klprint
Last active December 20, 2016 10:20
Show Gist options
  • Save klprint/cabfbb6b97547c85c9dfbb0f8975ac90 to your computer and use it in GitHub Desktop.
Save klprint/cabfbb6b97547c85c9dfbb0f8975ac90 to your computer and use it in GitHub Desktop.
# Analyse data using a sliding window
slideFunct <- function(data, window, step){
total <- length(data)
spots <- seq(from=1, to=(total-window), by=step)
result <- vector(length = length(spots))
for(i in 1:length(spots)){
result[i] <- median(data[spots[i]:(spots[i]+window)])
}
return(result)
}
quickmerge <- function(x, y){
df <- merge(x, y, by= "row.names", all.x= F, all.y= F)
rownames(df) <- df$Row.names
df$Row.names <- NULL
return(df)
}
quickmerge.multi <- function(..., all=F){
dfs <- list(...)
if(all == T){
merged.df <- dfs[[1]]
for(i in 2:length(dfs)){
merged.df <- quickmerge(merged.df, dfs[[i]], all.x=T, all.y=T)
}
}else{
merged.df <- Reduce(quickmerge, dfs)
}
return(merged.df)
}
scalelog2<-function(x=2,g){ #for below diagonal
for (i in 2:x){
for (j in 1:(i-1)) {
g[i,(j)]<-g[i,(j)] + scale_x_continuous(trans='log2') +
scale_y_continuous(trans='log2')
} }
for (i in 1:(x-1)){ #for the bottom row
g[x,i]<-g[x,i] + scale_y_continuous(trans='log2')
}
for (i in 1:x){ #for the diagonal
g[i,i]<-g[i,i]+ scale_x_continuous(trans='log2') }
return(g) }
default.read <- function(table.path,
sep='\t',
r.names = 1,
header = T){
df <- read.table(table.path,
sep=sep,
row.names = r.names,
header = header)
return(df)
}
colsums.as.df <- function(df){
df.cs <- as.data.frame(colSums(df))
colnames(df.cs) <- 'Colsum'
return(df.cs)
}
plot.colsums <- function(df, main = 'Number of reads per sample'){
df.cs <- colsums.as.df(df)
require(ggplot2)
p.plot <- ggplot(df.cs, aes(x=row.names(df.cs), y=Colsum)) +
geom_bar(stat = 'identity') +
ylab('Reads number') +
xlab('Sample') +
ggtitle(main)
return(p.plot)
}
# Order the df using the row.names in alphabetic decreaseing order
order.rownames <- function(df){
df.out <- df[order(row.names(df)), ]
return(df.out)
}
# Summarize reads of different DFs with same Colname
sum.reads <- function(df.x, df.y){
samples <- c(colnames(df.x), colnames(df.y))
samples <- samples[!duplicated(samples)]
df.x <- order.rownames(df.x)
df.y <- order.rownames(df.y)
r.names <- row.names(df.x)
df.out <- NULL
for(sample in samples){
if(sample %in% colnames(df.x) & sample %in% colnames(df.y)){
df.out[[sample]] <- df.x[[sample]] + df.y[[sample]]
}
if(sample %in% colnames(df.x) & !(sample %in% colnames(df.y))){
df.out[[sample]] <- df.x[[sample]]
}
if(sample %in% colnames(df.y) & !(sample %in% colnames(df.x))){
df.out[[sample]] <- df.y[[sample]]
}
}
df.out <- as.data.frame(df.out)
row.names(df.out) <- r.names
return(df.out)
}
replace.na <- function(vec, m=NULL) {
if(is.null(m)){
m <- mean(vec, na.rm = TRUE)
}
vec[is.na(vec)] <- m
return(vec)
}
assign.class <- function(df, ulist){
classes <- NULL
for (gene in row.names(df)){
if (gene %in% row.names(ulist)){
classes <- c(classes, ulist$Class[row.names(ulist) == gene ])
} else {
gene <- strsplit(gene, split = '\\.')[[1]]
# print(gene)
gene <- paste(gene[-(length(gene))], collapse = '')
if (gene %in% row.names(ulist)){
classes <- c(classes, ulist$Class[row.names(ulist) == gene ])
} else {
classes <- c(classes, 'ZUnknown')
}
}
}
return(classes)
}
progress <- function (x, max = 100) {
percent <- x / max * 100
cat(sprintf('\r[%-50s] %d%%',
paste(rep('=', percent / 2), collapse = ''),
floor(percent)))
if (x == max)
cat('\n')
}
count.subset <- function(seq, aa.subset, norm = F, normTo = width(seq)){
in.subset <- 0
for(aa in strsplit(seq, split = '')[[1]]){
if(aa %in% strsplit(aa.subset, split = '')[[1]]){ # test if AA is in subset
in.subset <- in.subset + 1
}
}
if (norm){
return(in.subset / normTo)
} else {
return(in.subset)
}
}
quarter.fun <- function(seq, name, sumarize = NULL, FUN, ...){
qs <- c('Q1', 'Q2', 'Q3', 'Q4')
seq <- strsplit(seq, split = '')[[1]]
seq.app <- c(seq, rep(NA, 4 - (length(seq) %% 4) ))
mat <- matrix(seq.app, nrow = 4)
q1 <- 1
q2 <- ncol(mat)
q3 <- 2* ncol(mat)
q4 <- 3* ncol(mat)
q5 <- length(seq)
seq.quarters <- list( Q1 = paste(seq[q1:q2], collapse = ''),
Q2 = paste(seq[ (q2+1) :q3], collapse =''),
Q3 = paste(seq[ (q3+1) :q4], collapse = ''),
Q4 = paste(seq[ (q4+1) :q5], collapse ='' ))
out <- NULL
for(q.seq in seq.quarters){
out <- c(out, FUN(q.seq, ...))
}
out <- data.frame(matrix(out, nrow = 1))
colnames(out) <- paste(name, qs, sep='')
if(!is.null(sumarize)){
out <- sum( out[sumarize] )
}
return(out)
}
find.patches <- function(seq, aa.subset, min.len = 2,
return.num = T, return.val = '',
normTo = NULL) {
aa.regex <- paste('[', aa.subset, ']', sep = '')
str.regex <- paste(aa.regex,'{', min.len, ',', width(seq) ,'}', sep = '')
# print(str.regex)
t <- gregexpr(str.regex, seq, perl = T)[[1]]
i <- 1
found.patches <- NULL
for(reg.pos in t){
found.patches <- c(found.patches, substr(seq, reg.pos, reg.pos + (attr(t, 'match.length')[i] - 1 ) ) )
i <- i+1
}
names(found.patches) <- NULL
if (return.num) {
if (!is.null(normTo)) {
return(length(found.patches) / normTo)
} else {
return(length(found.patches))
}
if (return.val == 'max'){
return(max( width(d) ))
} else if (return.val == 'mean') {
return(mean( width(d) ))
} else if (return.val == 'median') {
return(median( width(d) ))
}
# Return max patch
}else{
return(found.patches)
}
}
select.features <- function(df.for.WRT) {
start.time <- Sys.time()
print(start.time)
comp.mat <- list()
k <- 2
for (i in 1:ncol(df.for.WRT)){
temp.pval <- NULL
for(j in k:ncol(df.for.WRT)){
wr.test <- wilcox.test(df.for.WRT[,i], df.for.WRT[,j], paired = T, exact = F)
temp.pval <- c(temp.pval, wr.test$p.value)
}
temp.pval <- c(rep(NaN, k-1), temp.pval)
comp.mat[[colnames(df.for.WRT)[i]]] <- temp.pval
if(k < ncol(df.for.WRT)){
k <- k+1
}else{
k <- ncol(df.for.WRT)
}
progress(x = i, max = ncol(df.for.WRT))
}
finish.time <- Sys.time()
print(finish.time)
print(paste('Elapsed time: ', start.time - finish.time, sep = ''))
return(comp.mat)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment