Skip to content

Instantly share code, notes, and snippets.

@stephenturner
Created March 18, 2011 22:02
Show Gist options
  • Save stephenturner/876932 to your computer and use it in GitHub Desktop.
Save stephenturner/876932 to your computer and use it in GitHub Desktop.
forlani.r
#-----------------------------------------------------------------------------------------------
#
# Calculate PCs when the number of SNPs (m>4000) is large
# From G with SNPs on the columns, keep the first 20 PCs
# X = normed G, we want PCs (=Xv) where v is eigenvector of X'X (the covariance matrix)
# X'X can't be handled directly.
# INSTEAD,
# If XX'v= lv, X'XX'v = lX'v => if find eigenv for XX' then X'v is eigenv for X'X
# SIMILARLY, if X'Xv = lv, then XX'(Xv) = l(Xv) => the eigenv of XX' must be PCs that we want
# OF COURSE, the final result of the matrix with Xv must be orthonormal up to some constant
# THIS IS WHAT the EIGENSTRAT is about. BUT they never said it implicitly.
# For NAs:
# SAME as EIGENSTRAT, set NA to 0 in normalized G, so they don't contribute to PC
#
#-----------------------------------------------------------------------------------------------
pca1<-function(G, kEv=20) {
X = as.matrix(G) # required for using %*%
rm(G); gc()
# first normalize then set NA to 0
for (i in 1:ncol(X)) { # do it for each SNP
p<-mean(X[,i], na.rm=T)/2
X[,i]=(X[,i] - 2*p)/sqrt(2*p*(1-p)) # p is a scalar
X[,i][ is.na(X[,i]) ] = 0
}
# XXprime = X %*% t(X) # too much memory for large X
XXprime = array(NA, dim = c(nrow(X), nrow(X)))
for (i in 1: nrow(X)) for (j in 1:nrow(X)) {
XXprime[i,j] = sum(X[i,]*X[j,])
if (i > j ) XXprime[j,i ] = XXprime[i,j]
}
rm(X); gc()
ev = eigen(XXprime)
e.values = formatC(ev$value/sum(ev$value)*100, digits=2, format='f')[1:kEv] # output % of total variation
list (ev$vectors[,1:kEv], e.values)
}
#-----------------------------------------------------------------------------------------------
#-----------------------------------------------------------------------------------------------
#
# Try this other pca function when the number of markers is smaller than
# the number of subjects. You have to use the NormData function to
# normalize the data first --- if to be the same as what EigenStrat does.
# But the results should not be a lot different with or without
# normalization. As long as the memory is OK, this should not take more
# than half day.
#
#-----------------------------------------------------------------------------------------------
# Calculate Normalized G matrix (NormData)
# SNPs on *** COLUMN *** with 0,1,2, NA
#-----------------------------------------------------------------------------------------------
NormData <- function(G){
for (i in 1:ncol(G)) { # repeat for each SNP
p<-mean(G[,i], na.rm=T)/2
G[,i]=(G[,i] - 2*p)/sqrt(2*p*(1-p))
}
G
}
#-----------------------------------------------------------------------------------------------
# Direct calculation of covariance among m SNPs, where m < n
# EVs will be used to form PCs for normG
# SNP on ** Column **
#-----------------------------------------------------------------------------------------------
pca2<-function(NormG, kEv=20) {
X = as.matrix(NormG) # for matrix operation
n=nrow(X)
K = cov(X, use="pairwise.complete.obs") # cov gives covariance for columns
full = eigen(K)
EVs = full$vec[,1:kEv]; value = full$values
rm (full, K);
PCs = array(NA, dim=c(n,kEv))
for (i in 1:n) for (j in 1:kEv) {
t = X[i,]*EVs[,j] # elementwise product within vectors
PCs[i,j] = mean(t,na.rm=T)
}
e.values = formatC(value/sum(value)*100, digits=2, format='f')[1:kEv] # output % of total variation
list (PCs, e.values)
}
#-----------------------------------------------------------------------------------------------
d <- read.table("N:/EPI/turnersd/4kforpca.txt.gz",T)
d <- NormData(d)
pcs <- pca2(d)
save.image("N:/EPI/turnersd/pcs-1-with-d.RData")
rm(d)
save.image("N:/EPI/turnersd/pcs-2-without-d.RData")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment