Created
March 18, 2011 22:02
-
-
Save stephenturner/876932 to your computer and use it in GitHub Desktop.
forlani.r
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#----------------------------------------------------------------------------------------------- | |
# | |
# Calculate PCs when the number of SNPs (m>4000) is large | |
# From G with SNPs on the columns, keep the first 20 PCs | |
# X = normed G, we want PCs (=Xv) where v is eigenvector of X'X (the covariance matrix) | |
# X'X can't be handled directly. | |
# INSTEAD, | |
# If XX'v= lv, X'XX'v = lX'v => if find eigenv for XX' then X'v is eigenv for X'X | |
# SIMILARLY, if X'Xv = lv, then XX'(Xv) = l(Xv) => the eigenv of XX' must be PCs that we want | |
# OF COURSE, the final result of the matrix with Xv must be orthonormal up to some constant | |
# THIS IS WHAT the EIGENSTRAT is about. BUT they never said it implicitly. | |
# For NAs: | |
# SAME as EIGENSTRAT, set NA to 0 in normalized G, so they don't contribute to PC | |
# | |
#----------------------------------------------------------------------------------------------- | |
pca1<-function(G, kEv=20) { | |
X = as.matrix(G) # required for using %*% | |
rm(G); gc() | |
# first normalize then set NA to 0 | |
for (i in 1:ncol(X)) { # do it for each SNP | |
p<-mean(X[,i], na.rm=T)/2 | |
X[,i]=(X[,i] - 2*p)/sqrt(2*p*(1-p)) # p is a scalar | |
X[,i][ is.na(X[,i]) ] = 0 | |
} | |
# XXprime = X %*% t(X) # too much memory for large X | |
XXprime = array(NA, dim = c(nrow(X), nrow(X))) | |
for (i in 1: nrow(X)) for (j in 1:nrow(X)) { | |
XXprime[i,j] = sum(X[i,]*X[j,]) | |
if (i > j ) XXprime[j,i ] = XXprime[i,j] | |
} | |
rm(X); gc() | |
ev = eigen(XXprime) | |
e.values = formatC(ev$value/sum(ev$value)*100, digits=2, format='f')[1:kEv] # output % of total variation | |
list (ev$vectors[,1:kEv], e.values) | |
} | |
#----------------------------------------------------------------------------------------------- | |
#----------------------------------------------------------------------------------------------- | |
# | |
# Try this other pca function when the number of markers is smaller than | |
# the number of subjects. You have to use the NormData function to | |
# normalize the data first --- if to be the same as what EigenStrat does. | |
# But the results should not be a lot different with or without | |
# normalization. As long as the memory is OK, this should not take more | |
# than half day. | |
# | |
#----------------------------------------------------------------------------------------------- | |
# Calculate Normalized G matrix (NormData) | |
# SNPs on *** COLUMN *** with 0,1,2, NA | |
#----------------------------------------------------------------------------------------------- | |
NormData <- function(G){ | |
for (i in 1:ncol(G)) { # repeat for each SNP | |
p<-mean(G[,i], na.rm=T)/2 | |
G[,i]=(G[,i] - 2*p)/sqrt(2*p*(1-p)) | |
} | |
G | |
} | |
#----------------------------------------------------------------------------------------------- | |
# Direct calculation of covariance among m SNPs, where m < n | |
# EVs will be used to form PCs for normG | |
# SNP on ** Column ** | |
#----------------------------------------------------------------------------------------------- | |
pca2<-function(NormG, kEv=20) { | |
X = as.matrix(NormG) # for matrix operation | |
n=nrow(X) | |
K = cov(X, use="pairwise.complete.obs") # cov gives covariance for columns | |
full = eigen(K) | |
EVs = full$vec[,1:kEv]; value = full$values | |
rm (full, K); | |
PCs = array(NA, dim=c(n,kEv)) | |
for (i in 1:n) for (j in 1:kEv) { | |
t = X[i,]*EVs[,j] # elementwise product within vectors | |
PCs[i,j] = mean(t,na.rm=T) | |
} | |
e.values = formatC(value/sum(value)*100, digits=2, format='f')[1:kEv] # output % of total variation | |
list (PCs, e.values) | |
} | |
#----------------------------------------------------------------------------------------------- | |
d <- read.table("N:/EPI/turnersd/4kforpca.txt.gz",T) | |
d <- NormData(d) | |
pcs <- pca2(d) | |
save.image("N:/EPI/turnersd/pcs-1-with-d.RData") | |
rm(d) | |
save.image("N:/EPI/turnersd/pcs-2-without-d.RData") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment