Created
May 28, 2020 21:33
-
-
Save alexpreynolds/644bc120e78874822da89e52169a866a to your computer and use it in GitHub Desktop.
Compute the Pearson correlation of two BED files (raw text or archived/compressed)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env Rscript | |
# -*- mode: R -*- | |
# Above tells emacs to go into R mode for editing (must be second line). | |
## | |
## chromCor: | |
## | |
## Compute correlation of two jarched bed files along a chromosome (default chr19). | |
## | |
## To run this script, turn the execute bit on and run just as any other script file. The Rscript | |
## command must be in your path. Rscript comes with R distributions 2.8 (?) and higher, and is in the | |
## same directory as the R script itself (eg, /usr/local/R/bin). | |
## | |
## Author: RET | |
## Date: 20 Aug 2009 | |
usage <- paste('Usage: chromCor [-c <chromosome>] [-n <column number>] [-z] [-u] <bed1.jarch> <bed2.jarch>', | |
'', | |
'Compute correlation of two bed files along a chromosome (default chr19).', | |
'', | |
'By default, <bed1.jarch> and <bed2.jarch> are jarched 5-column bed files, with score to be correlated in the', | |
'fifth column. There must be the same number of entries for the desired chromosome in each bed file.', | |
'', | |
'<chromosome> is a chromosome name; by default chr19, or "all", for all entries in the bed file.', | |
'', | |
'<column number> is the column in each file containing the data to correlate; by default, 5.', | |
'', | |
'-s specifies that bed files are starched instead of jarched.', | |
'', | |
'-z specifies that bed files are gzipped instead of jarched.', | |
'', | |
'-u specifies that bed files are uncompressed (not jarched or gzipped).', | |
'', | |
'Expects gchr and Rscript to be in the user\'s path.', | |
'', | |
'Writes results to stdout.', | |
'', | |
'\n', | |
sep = '\n') | |
args <- commandArgs(T) | |
if(length(args) <= 1){ | |
cat(usage) | |
quit('no') | |
} | |
chr <- 'chr19' | |
col <- 5 | |
uncompr <- c('', '') | |
i <- 1 | |
while(substr(args[i], 1, 1) == '-'){ | |
opt <- substr(args[i], 2, 2) | |
if(opt == 'c'){ | |
chr <- args[i+1] | |
if(chr == 'all') chr = '' | |
i <- i + 2 | |
}else if(opt == 'n'){ | |
col <- as.integer(args[i+1]) | |
i <- i + 2 | |
}else if(opt == 'z'){ | |
uncompr <- c('zcat', 'zcat') | |
i <- i + 1 | |
}else if(opt == 's'){ | |
uncompr <- c('unstarch', 'unstarch') | |
i <- i + 1 | |
}else if(opt == 'u'){ | |
uncompr <- c('cat', 'cat') | |
i <- i + 1 | |
}else{ | |
cat('Unrecognized option:', args[i], '\n', file = stderr()) | |
cat(usage, file = stderr()) | |
quit('no') | |
} | |
} | |
bed <- args[c(i, i+1)] | |
rds <- list() | |
for(i in 1:2){ | |
if(uncompr[i] == ''){ | |
ext <- gsub('.*\\.', '', bed[i]) | |
if( ext == "bed"){ | |
uncompr[i] <- 'cat' | |
} else if( ext == "jarch"){ | |
uncompr[i] <- 'gchr' | |
} else if( ext == "starch"){ | |
uncompr[i] <- 'unstarch' | |
} else if( ext == "gz" || ext == "zip"){ | |
uncompr[i] <- 'zcat' | |
} else{ | |
cat('Unrecognized file extension:', bed[i], '\n', file = stderr()) | |
quit('no') | |
} | |
} | |
if(chr == '') | |
cmd <- paste(uncompr[i], '%BED') | |
else{ | |
if(uncompr[i] == "cat" || uncompr[i] == "zcat"){ | |
cmd <- paste(uncompr[i], ' %BED | grep "^', chr, '" -', sep = '') | |
}else if(uncompr[i] == "unstarch" || uncompr[i] == "gchr"){ | |
cmd <- paste(uncompr[i], chr, '%BED') | |
} | |
} | |
cmd <- paste(cmd, ' | cut -f', col, sep = '') | |
cat('Reading', bed[i], '...\n', file = stderr()) | |
con <- pipe(gsub('%BED', bed[i], cmd)) | |
rds[[i]] <- scan(con, what = numeric(0)) | |
close(con) | |
} | |
cv <- cor(rds[[1]], rds[[2]]) | |
cat(cv, '\n') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment