explodecomputer · November 3, 2015 15:07
diff --git a/ewas.R b/ewas.R

 arguments <- commandArgs(T)
 k <- as.numeric(arguments[1])
 threshold <- as.numeric(arguments[2])
 nfold <- as.numeric(arguments[3])

 aries.release.dir <- "/panfs/panasas01/shared/alspac/deprecated/aries/releasev2_apr2014/"

 time.point <- "cord" ## "FOM","antenatal","cord","F7","15up"

 ## load methylation data
 load(file.path(aries.release.dir, paste("ALN.tost.betas", time.point, "releasev2_apr2014.Rdata", sep=".")))

 ## load estimated cell counts
 load(file.path(aries.release.dir, paste("ALN.houseman450kData", time.point, "releasev2_apr2014.Rdata", sep=".")))
 cell.counts <- houseman450kData$counts
 stopifnot(rownames(cell.counts) == colnames(tbetas))
 rm(houseman450kData)

 ## load sample information
 manifest <- read.table(file.path(aries.release.dir, 'ARIES.manifest.releasev2_apr2014.txt'), sep=',',header=TRUE)
 manifest <- manifest[which(manifest$time_point == time.point),]
 manifest <- manifest[match(colnames(tbetas), manifest$ALN),]

 ## identify probes that do not cross-hybridize and are unlikely to be affected by genetic varation
 good.probes <- read.csv(file.path(aries.release.dir, "naeemProbeInfo.csv"),check.names=F)
 good.probes <- good.probes[which(good.probes[,"Flag(discard/keep)"] == "keep"),]

 tbetas <- tbetas[match(good.probes$probe, rownames(tbetas)),]

 library(CpGassoc)

 ## independent variable is gender
 indep <- sign(manifest$gender == "m")

 ## covariates are cell count estimates
 covariates <- cell.counts



 nid <- ncol(tbetas)
 chunksize <- ceiling(nid/nfold)

 s1 <- seq(1, nid, by=nid/nfold)
 s2 <- s1 + nid/nfold - 1

 index <- s1[k] : s2[k]

 tbetas_train <- tbetas[, -index]




 ## MWAS
 ret <- cpg.assoc(tbetas[, -index],
                 indep=indep[-index],
                 covariates=covariates[-index, ])

 ## statistic for each probe
 results <- ret$results
 results$effect.size <- ret$coefficients$effect.size
 results$std.error <- ret$coefficients$std.error

 ## probes with the 1000 strongest p-values
 # top1000 <- results[order(results$P.value, decreasing=F)[1:1000],]

 results <- subset(results, P.value < threshold)



 # Now make predictor in only the 10% who weren't in the training set

 # Check this bit...
 tbetas_test <- tbteas[results$CPG, index] 

 predictor <- t(tbetas_test) %*% results$effect.size


 # Now test association of predictor 

 read in height data
 make sure height IDs match with tbetas IDs
 extract just 'index' height IDs
 perform regression / correlation of predictor vs height phenotype

 save these results

 save(results, ..., etc, etc, file=paste0("results", k, ".RData"))
diff --git a/submit.sh b/submit.sh
 #!/bin/bash

 #PBS -N test
 #PBS -o test-output
 #PBS -e test-error
 #PBS -t 1-500
 #PBS -l walltime=5:00:00
 #PBS -l nodes=1:ppn=1
 #PBS -S /bin/bash

 set -e

 echo "Running on ${HOSTNAME}"

 # If there is an argument passed at commandline then set PBS_ARRAYID as that
 # Else, PBS_ARRAYID is either empty if run in commandline, 
 # or set by the job scheduler if run by qsub
 if [ -n "${1}" ]; then
  echo "${1}"
  PBS_ARRAYID=${1}
 fi

 i=${PBS_ARRAYID}

 # Analysis
 R --no-save --args ${i} 1e-4 10 < ewas.R

	arguments <- commandArgs(T)
	k <- as.numeric(arguments[1])
	threshold <- as.numeric(arguments[2])
	nfold <- as.numeric(arguments[3])

	aries.release.dir <- "/panfs/panasas01/shared/alspac/deprecated/aries/releasev2_apr2014/"

	time.point <- "cord" ## "FOM","antenatal","cord","F7","15up"

	## load methylation data
	load(file.path(aries.release.dir, paste("ALN.tost.betas", time.point, "releasev2_apr2014.Rdata", sep=".")))

	## load estimated cell counts
	load(file.path(aries.release.dir, paste("ALN.houseman450kData", time.point, "releasev2_apr2014.Rdata", sep=".")))
	cell.counts <- houseman450kData$counts
	stopifnot(rownames(cell.counts) == colnames(tbetas))
	rm(houseman450kData)

	## load sample information
	manifest <- read.table(file.path(aries.release.dir, 'ARIES.manifest.releasev2_apr2014.txt'), sep=',',header=TRUE)
	manifest <- manifest[which(manifest$time_point == time.point),]
	manifest <- manifest[match(colnames(tbetas), manifest$ALN),]

	## identify probes that do not cross-hybridize and are unlikely to be affected by genetic varation
	good.probes <- read.csv(file.path(aries.release.dir, "naeemProbeInfo.csv"),check.names=F)
	good.probes <- good.probes[which(good.probes[,"Flag(discard/keep)"] == "keep"),]

	tbetas <- tbetas[match(good.probes$probe, rownames(tbetas)),]

	library(CpGassoc)

	## independent variable is gender
	indep <- sign(manifest$gender == "m")

	## covariates are cell count estimates
	covariates <- cell.counts



	nid <- ncol(tbetas)
	chunksize <- ceiling(nid/nfold)

	s1 <- seq(1, nid, by=nid/nfold)
	s2 <- s1 + nid/nfold - 1

	index <- s1[k] : s2[k]

	tbetas_train <- tbetas[, -index]




	## MWAS
	ret <- cpg.assoc(tbetas[, -index],
	indep=indep[-index],
	covariates=covariates[-index, ])

	## statistic for each probe
	results <- ret$results
	results$effect.size <- ret$coefficients$effect.size
	results$std.error <- ret$coefficients$std.error

	## probes with the 1000 strongest p-values
	# top1000 <- results[order(results$P.value, decreasing=F)[1:1000],]

	results <- subset(results, P.value < threshold)



	# Now make predictor in only the 10% who weren't in the training set

	# Check this bit...
	tbetas_test <- tbteas[results$CPG, index]

	predictor <- t(tbetas_test) %*% results$effect.size


	# Now test association of predictor

	read in height data
	make sure height IDs match with tbetas IDs
	extract just 'index' height IDs
	perform regression / correlation of predictor vs height phenotype

	save these results

	save(results, ..., etc, etc, file=paste0("results", k, ".RData"))
	#!/bin/bash

	#PBS -N test
	#PBS -o test-output
	#PBS -e test-error
	#PBS -t 1-500
	#PBS -l walltime=5:00:00
	#PBS -l nodes=1:ppn=1
	#PBS -S /bin/bash

	set -e

	echo "Running on ${HOSTNAME}"

	# If there is an argument passed at commandline then set PBS_ARRAYID as that
	# Else, PBS_ARRAYID is either empty if run in commandline,
	# or set by the job scheduler if run by qsub
	if [ -n "${1}" ]; then
	echo "${1}"
	PBS_ARRAYID=${1}
	fi

	i=${PBS_ARRAYID}

	# Analysis
	R --no-save --args ${i} 1e-4 10 < ewas.R