Skip to content

Instantly share code, notes, and snippets.

@alexpreynolds
Created May 28, 2020 21:33
Show Gist options
  • Save alexpreynolds/644bc120e78874822da89e52169a866a to your computer and use it in GitHub Desktop.
Save alexpreynolds/644bc120e78874822da89e52169a866a to your computer and use it in GitHub Desktop.
Compute the Pearson correlation of two BED files (raw text or archived/compressed)
#! /usr/bin/env Rscript
# -*- mode: R -*-
# Above tells emacs to go into R mode for editing (must be second line).
##
## chromCor:
##
## Compute correlation of two jarched bed files along a chromosome (default chr19).
##
## To run this script, turn the execute bit on and run just as any other script file. The Rscript
## command must be in your path. Rscript comes with R distributions 2.8 (?) and higher, and is in the
## same directory as the R script itself (eg, /usr/local/R/bin).
##
## Author: RET
## Date: 20 Aug 2009
usage <- paste('Usage: chromCor [-c <chromosome>] [-n <column number>] [-z] [-u] <bed1.jarch> <bed2.jarch>',
'',
'Compute correlation of two bed files along a chromosome (default chr19).',
'',
'By default, <bed1.jarch> and <bed2.jarch> are jarched 5-column bed files, with score to be correlated in the',
'fifth column. There must be the same number of entries for the desired chromosome in each bed file.',
'',
'<chromosome> is a chromosome name; by default chr19, or "all", for all entries in the bed file.',
'',
'<column number> is the column in each file containing the data to correlate; by default, 5.',
'',
'-s specifies that bed files are starched instead of jarched.',
'',
'-z specifies that bed files are gzipped instead of jarched.',
'',
'-u specifies that bed files are uncompressed (not jarched or gzipped).',
'',
'Expects gchr and Rscript to be in the user\'s path.',
'',
'Writes results to stdout.',
'',
'\n',
sep = '\n')
args <- commandArgs(T)
if(length(args) <= 1){
cat(usage)
quit('no')
}
chr <- 'chr19'
col <- 5
uncompr <- c('', '')
i <- 1
while(substr(args[i], 1, 1) == '-'){
opt <- substr(args[i], 2, 2)
if(opt == 'c'){
chr <- args[i+1]
if(chr == 'all') chr = ''
i <- i + 2
}else if(opt == 'n'){
col <- as.integer(args[i+1])
i <- i + 2
}else if(opt == 'z'){
uncompr <- c('zcat', 'zcat')
i <- i + 1
}else if(opt == 's'){
uncompr <- c('unstarch', 'unstarch')
i <- i + 1
}else if(opt == 'u'){
uncompr <- c('cat', 'cat')
i <- i + 1
}else{
cat('Unrecognized option:', args[i], '\n', file = stderr())
cat(usage, file = stderr())
quit('no')
}
}
bed <- args[c(i, i+1)]
rds <- list()
for(i in 1:2){
if(uncompr[i] == ''){
ext <- gsub('.*\\.', '', bed[i])
if( ext == "bed"){
uncompr[i] <- 'cat'
} else if( ext == "jarch"){
uncompr[i] <- 'gchr'
} else if( ext == "starch"){
uncompr[i] <- 'unstarch'
} else if( ext == "gz" || ext == "zip"){
uncompr[i] <- 'zcat'
} else{
cat('Unrecognized file extension:', bed[i], '\n', file = stderr())
quit('no')
}
}
if(chr == '')
cmd <- paste(uncompr[i], '%BED')
else{
if(uncompr[i] == "cat" || uncompr[i] == "zcat"){
cmd <- paste(uncompr[i], ' %BED | grep "^', chr, '" -', sep = '')
}else if(uncompr[i] == "unstarch" || uncompr[i] == "gchr"){
cmd <- paste(uncompr[i], chr, '%BED')
}
}
cmd <- paste(cmd, ' | cut -f', col, sep = '')
cat('Reading', bed[i], '...\n', file = stderr())
con <- pipe(gsub('%BED', bed[i], cmd))
rds[[i]] <- scan(con, what = numeric(0))
close(con)
}
cv <- cor(rds[[1]], rds[[2]])
cat(cv, '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment