Last active
November 3, 2015 15:07
-
-
Save explodecomputer/1300138859cb02b9328a to your computer and use it in GitHub Desktop.
k fold cv ewas example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
arguments <- commandArgs(T) | |
k <- as.numeric(arguments[1]) | |
threshold <- as.numeric(arguments[2]) | |
nfold <- as.numeric(arguments[3]) | |
aries.release.dir <- "/panfs/panasas01/shared/alspac/deprecated/aries/releasev2_apr2014/" | |
time.point <- "cord" ## "FOM","antenatal","cord","F7","15up" | |
## load methylation data | |
load(file.path(aries.release.dir, paste("ALN.tost.betas", time.point, "releasev2_apr2014.Rdata", sep="."))) | |
## load estimated cell counts | |
load(file.path(aries.release.dir, paste("ALN.houseman450kData", time.point, "releasev2_apr2014.Rdata", sep="."))) | |
cell.counts <- houseman450kData$counts | |
stopifnot(rownames(cell.counts) == colnames(tbetas)) | |
rm(houseman450kData) | |
## load sample information | |
manifest <- read.table(file.path(aries.release.dir, 'ARIES.manifest.releasev2_apr2014.txt'), sep=',',header=TRUE) | |
manifest <- manifest[which(manifest$time_point == time.point),] | |
manifest <- manifest[match(colnames(tbetas), manifest$ALN),] | |
## identify probes that do not cross-hybridize and are unlikely to be affected by genetic varation | |
good.probes <- read.csv(file.path(aries.release.dir, "naeemProbeInfo.csv"),check.names=F) | |
good.probes <- good.probes[which(good.probes[,"Flag(discard/keep)"] == "keep"),] | |
tbetas <- tbetas[match(good.probes$probe, rownames(tbetas)),] | |
library(CpGassoc) | |
## independent variable is gender | |
indep <- sign(manifest$gender == "m") | |
## covariates are cell count estimates | |
covariates <- cell.counts | |
nid <- ncol(tbetas) | |
chunksize <- ceiling(nid/nfold) | |
s1 <- seq(1, nid, by=nid/nfold) | |
s2 <- s1 + nid/nfold - 1 | |
index <- s1[k] : s2[k] | |
tbetas_train <- tbetas[, -index] | |
## MWAS | |
ret <- cpg.assoc(tbetas[, -index], | |
indep=indep[-index], | |
covariates=covariates[-index, ]) | |
## statistic for each probe | |
results <- ret$results | |
results$effect.size <- ret$coefficients$effect.size | |
results$std.error <- ret$coefficients$std.error | |
## probes with the 1000 strongest p-values | |
# top1000 <- results[order(results$P.value, decreasing=F)[1:1000],] | |
results <- subset(results, P.value < threshold) | |
# Now make predictor in only the 10% who weren't in the training set | |
# Check this bit... | |
tbetas_test <- tbteas[results$CPG, index] | |
predictor <- t(tbetas_test) %*% results$effect.size | |
# Now test association of predictor | |
read in height data | |
make sure height IDs match with tbetas IDs | |
extract just 'index' height IDs | |
perform regression / correlation of predictor vs height phenotype | |
save these results | |
save(results, ..., etc, etc, file=paste0("results", k, ".RData")) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
#PBS -N test | |
#PBS -o test-output | |
#PBS -e test-error | |
#PBS -t 1-500 | |
#PBS -l walltime=5:00:00 | |
#PBS -l nodes=1:ppn=1 | |
#PBS -S /bin/bash | |
set -e | |
echo "Running on ${HOSTNAME}" | |
# If there is an argument passed at commandline then set PBS_ARRAYID as that | |
# Else, PBS_ARRAYID is either empty if run in commandline, | |
# or set by the job scheduler if run by qsub | |
if [ -n "${1}" ]; then | |
echo "${1}" | |
PBS_ARRAYID=${1} | |
fi | |
i=${PBS_ARRAYID} | |
# Analysis | |
R --no-save --args ${i} 1e-4 10 < ewas.R |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment