Skip to content

Instantly share code, notes, and snippets.

@explodecomputer
Last active November 3, 2015 15:07
Show Gist options
  • Save explodecomputer/1300138859cb02b9328a to your computer and use it in GitHub Desktop.
Save explodecomputer/1300138859cb02b9328a to your computer and use it in GitHub Desktop.
k fold cv ewas example
arguments <- commandArgs(T)
k <- as.numeric(arguments[1])
threshold <- as.numeric(arguments[2])
nfold <- as.numeric(arguments[3])
aries.release.dir <- "/panfs/panasas01/shared/alspac/deprecated/aries/releasev2_apr2014/"
time.point <- "cord" ## "FOM","antenatal","cord","F7","15up"
## load methylation data
load(file.path(aries.release.dir, paste("ALN.tost.betas", time.point, "releasev2_apr2014.Rdata", sep=".")))
## load estimated cell counts
load(file.path(aries.release.dir, paste("ALN.houseman450kData", time.point, "releasev2_apr2014.Rdata", sep=".")))
cell.counts <- houseman450kData$counts
stopifnot(rownames(cell.counts) == colnames(tbetas))
rm(houseman450kData)
## load sample information
manifest <- read.table(file.path(aries.release.dir, 'ARIES.manifest.releasev2_apr2014.txt'), sep=',',header=TRUE)
manifest <- manifest[which(manifest$time_point == time.point),]
manifest <- manifest[match(colnames(tbetas), manifest$ALN),]
## identify probes that do not cross-hybridize and are unlikely to be affected by genetic varation
good.probes <- read.csv(file.path(aries.release.dir, "naeemProbeInfo.csv"),check.names=F)
good.probes <- good.probes[which(good.probes[,"Flag(discard/keep)"] == "keep"),]
tbetas <- tbetas[match(good.probes$probe, rownames(tbetas)),]
library(CpGassoc)
## independent variable is gender
indep <- sign(manifest$gender == "m")
## covariates are cell count estimates
covariates <- cell.counts
nid <- ncol(tbetas)
chunksize <- ceiling(nid/nfold)
s1 <- seq(1, nid, by=nid/nfold)
s2 <- s1 + nid/nfold - 1
index <- s1[k] : s2[k]
tbetas_train <- tbetas[, -index]
## MWAS
ret <- cpg.assoc(tbetas[, -index],
indep=indep[-index],
covariates=covariates[-index, ])
## statistic for each probe
results <- ret$results
results$effect.size <- ret$coefficients$effect.size
results$std.error <- ret$coefficients$std.error
## probes with the 1000 strongest p-values
# top1000 <- results[order(results$P.value, decreasing=F)[1:1000],]
results <- subset(results, P.value < threshold)
# Now make predictor in only the 10% who weren't in the training set
# Check this bit...
tbetas_test <- tbteas[results$CPG, index]
predictor <- t(tbetas_test) %*% results$effect.size
# Now test association of predictor
read in height data
make sure height IDs match with tbetas IDs
extract just 'index' height IDs
perform regression / correlation of predictor vs height phenotype
save these results
save(results, ..., etc, etc, file=paste0("results", k, ".RData"))
#!/bin/bash
#PBS -N test
#PBS -o test-output
#PBS -e test-error
#PBS -t 1-500
#PBS -l walltime=5:00:00
#PBS -l nodes=1:ppn=1
#PBS -S /bin/bash
set -e
echo "Running on ${HOSTNAME}"
# If there is an argument passed at commandline then set PBS_ARRAYID as that
# Else, PBS_ARRAYID is either empty if run in commandline,
# or set by the job scheduler if run by qsub
if [ -n "${1}" ]; then
echo "${1}"
PBS_ARRAYID=${1}
fi
i=${PBS_ARRAYID}
# Analysis
R --no-save --args ${i} 1e-4 10 < ewas.R
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment