stephenturner · March 18, 2011 22:02
diff --git a/forlani.r b/forlani.r
 #-----------------------------------------------------------------------------------------------
 #
 #  	Calculate PCs when the number of SNPs (m>4000) is large
 #  	From G with SNPs on the columns, keep the first 20 PCs
 #	X = normed G, we want PCs (=Xv) where v is eigenvector of X'X (the covariance matrix)
 #	X'X can't be handled directly. 
 #	INSTEAD,
 #	     	If XX'v= lv, X'XX'v = lX'v  => if find eigenv for XX' then X'v is eigenv for X'X
 #		SIMILARLY, if X'Xv = lv, then XX'(Xv) = l(Xv) => the eigenv of XX' must be PCs that we want
 #			OF COURSE, the final result of the matrix with Xv must be orthonormal up to some constant 
 #		THIS IS WHAT the EIGENSTRAT is about. BUT they never said it implicitly.
 #	For NAs:
 #		SAME as EIGENSTRAT, set NA to 0 in normalized G, so they don't contribute to PC
 #
 #-----------------------------------------------------------------------------------------------
 pca1<-function(G, kEv=20) {

 	X = as.matrix(G)						# required for using %*% 
 	rm(G); gc()

 	# first normalize then set NA to 0
 
 	for (i in 1:ncol(X)) {					# do it for each SNP
 		p<-mean(X[,i], na.rm=T)/2			
 		X[,i]=(X[,i] - 2*p)/sqrt(2*p*(1-p))  	# p is a scalar
 		X[,i][ is.na(X[,i]) ] = 0
  }
 						
 	# XXprime = X %*% t(X)					#	too much memory for large X

 	XXprime = array(NA, dim = c(nrow(X), nrow(X)))  
 	
 	for (i in 1: nrow(X)) for (j in 1:nrow(X)) {
 		XXprime[i,j] = sum(X[i,]*X[j,])
 		if (i > j ) XXprime[j,i ]	= XXprime[i,j]     	
  }

 	rm(X); gc()

 ev = eigen(XXprime)	
 	e.values = formatC(ev$value/sum(ev$value)*100, digits=2, format='f')[1:kEv]		# output % of total variation
 	list (ev$vectors[,1:kEv], e.values)				
 }
 #-----------------------------------------------------------------------------------------------


 #-----------------------------------------------------------------------------------------------
 #
 # Try this other pca function when the number of markers is smaller than 
 # the number of subjects. You have to use the NormData function to 
 # normalize the data first --- if to be the same as what EigenStrat does. 
 # But the results should not be a lot different with or without 
 # normalization. As long as the memory is OK, this should not take more 
 # than half day.
 #
 #-----------------------------------------------------------------------------------------------
 #  Calculate Normalized G matrix (NormData)
 #  SNPs on ***  COLUMN *** with 0,1,2, NA
 #-----------------------------------------------------------------------------------------------
 NormData <- function(G){
  for (i in 1:ncol(G)) {   # repeat for each SNP
 		p<-mean(G[,i], na.rm=T)/2
 		G[,i]=(G[,i] - 2*p)/sqrt(2*p*(1-p))
 	}
 	G
 }
 #-----------------------------------------------------------------------------------------------
 # Direct calculation of covariance among m SNPs, where m < n
 #  EVs will be used to form PCs for normG
 #  SNP on ** Column **
 #-----------------------------------------------------------------------------------------------
 pca2<-function(NormG, kEv=20) { 
 	X = as.matrix(NormG)     # for matrix operation
 	n=nrow(X)
 	
 	K = cov(X, use="pairwise.complete.obs")  #  cov gives covariance for columns
 	full = eigen(K)
 	EVs = full$vec[,1:kEv];   value = full$values  
 	rm (full, K);
 	
 	PCs = array(NA, dim=c(n,kEv))  
 	for (i in 1:n) for (j in 1:kEv) {
 		t = X[i,]*EVs[,j]   # elementwise product within vectors
 		PCs[i,j] = mean(t,na.rm=T) 
 	}
 	
 	e.values = formatC(value/sum(value)*100, digits=2, format='f')[1:kEv]  # output % of total variation
 	list (PCs, e.values) 
 }
 #-----------------------------------------------------------------------------------------------

 d <- read.table("N:/EPI/turnersd/4kforpca.txt.gz",T)
 d <- NormData(d)
 pcs <- pca2(d)
 save.image("N:/EPI/turnersd/pcs-1-with-d.RData")
 rm(d)
 save.image("N:/EPI/turnersd/pcs-2-without-d.RData")
	#-----------------------------------------------------------------------------------------------
	#
	# Calculate PCs when the number of SNPs (m>4000) is large
	# From G with SNPs on the columns, keep the first 20 PCs
	# X = normed G, we want PCs (=Xv) where v is eigenvector of X'X (the covariance matrix)
	# X'X can't be handled directly.
	# INSTEAD,
	# If XX'v= lv, X'XX'v = lX'v => if find eigenv for XX' then X'v is eigenv for X'X
	# SIMILARLY, if X'Xv = lv, then XX'(Xv) = l(Xv) => the eigenv of XX' must be PCs that we want
	# OF COURSE, the final result of the matrix with Xv must be orthonormal up to some constant
	# THIS IS WHAT the EIGENSTRAT is about. BUT they never said it implicitly.
	# For NAs:
	# SAME as EIGENSTRAT, set NA to 0 in normalized G, so they don't contribute to PC
	#
	#-----------------------------------------------------------------------------------------------
	pca1<-function(G, kEv=20) {

	X = as.matrix(G) # required for using %*%
	rm(G); gc()

	# first normalize then set NA to 0

	for (i in 1:ncol(X)) { # do it for each SNP
	p<-mean(X[,i], na.rm=T)/2
	X[,i]=(X[,i] - 2p)/sqrt(2p*(1-p)) # p is a scalar
	X[,i][ is.na(X[,i]) ] = 0
	}

	# XXprime = X %*% t(X) # too much memory for large X

	XXprime = array(NA, dim = c(nrow(X), nrow(X)))

	for (i in 1: nrow(X)) for (j in 1:nrow(X)) {
	XXprime[i,j] = sum(X[i,]*X[j,])
	if (i > j ) XXprime[j,i ] = XXprime[i,j]
	}

	rm(X); gc()

	ev = eigen(XXprime)
	e.values = formatC(ev$value/sum(ev$value)*100, digits=2, format='f')[1:kEv] # output % of total variation
	list (ev$vectors[,1:kEv], e.values)
	}
	#-----------------------------------------------------------------------------------------------


	#-----------------------------------------------------------------------------------------------
	#
	# Try this other pca function when the number of markers is smaller than
	# the number of subjects. You have to use the NormData function to
	# normalize the data first --- if to be the same as what EigenStrat does.
	# But the results should not be a lot different with or without
	# normalization. As long as the memory is OK, this should not take more
	# than half day.
	#
	#-----------------------------------------------------------------------------------------------
	# Calculate Normalized G matrix (NormData)
	# SNPs on * COLUMN * with 0,1,2, NA
	#-----------------------------------------------------------------------------------------------
	NormData <- function(G){
	for (i in 1:ncol(G)) { # repeat for each SNP
	p<-mean(G[,i], na.rm=T)/2
	G[,i]=(G[,i] - 2p)/sqrt(2p*(1-p))
	}
	G
	}
	#-----------------------------------------------------------------------------------------------
	# Direct calculation of covariance among m SNPs, where m < n
	# EVs will be used to form PCs for normG
	# SNP on Column
	#-----------------------------------------------------------------------------------------------
	pca2<-function(NormG, kEv=20) {
	X = as.matrix(NormG) # for matrix operation
	n=nrow(X)

	K = cov(X, use="pairwise.complete.obs") # cov gives covariance for columns
	full = eigen(K)
	EVs = full$vec[,1:kEv]; value = full$values
	rm (full, K);

	PCs = array(NA, dim=c(n,kEv))
	for (i in 1:n) for (j in 1:kEv) {
	t = X[i,]*EVs[,j] # elementwise product within vectors
	PCs[i,j] = mean(t,na.rm=T)
	}

	e.values = formatC(value/sum(value)*100, digits=2, format='f')[1:kEv] # output % of total variation
	list (PCs, e.values)
	}
	#-----------------------------------------------------------------------------------------------

	d <- read.table("N:/EPI/turnersd/4kforpca.txt.gz",T)
	d <- NormData(d)
	pcs <- pca2(d)
	save.image("N:/EPI/turnersd/pcs-1-with-d.RData")
	rm(d)
	save.image("N:/EPI/turnersd/pcs-2-without-d.RData")