Last active
December 16, 2015 21:48
-
-
Save michalskop/5502013 to your computer and use it in GitHub Desktop.
W PCA (in R)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Principal factor analysis with "natural" weights - Model of parliamentary voting. | |
| (PCA equals to Multidimensional scaling) | |
| Cutting lines: perpendicular to normal vectors of divisions, minimizing the sum of "error distances" in given hyperplane |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Weighted PCA |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CUTTING LINES | |
# based on overall model | |
#minimizing function | |
f_cutting_lines=function(beta0) -1*sum(apply(cbind(y*(x%*%beta+beta0),zeros),1,min)) | |
#how many dimensions? | |
n_first_dimensions = 2 | |
#preparing variables | |
normals = Hy[,1:n_first_dimensions] | |
loss_f = data.frame(matrix(0,nrow=dim(H)[1],ncol=4)) | |
colnames(loss_f)=c("Parameter1","Loss1","Parameter_1","Loss_1") | |
parameters = data.frame(matrix(0,nrow=dim(H)[1],ncol=3)) | |
colnames(parameters)=c("Parameter","Loss","Direction") | |
xfull = t(t(He$vectors[,1:n_first_dimensions]) * sqrt(He$values[1:n_first_dimensions])) | |
#calculating all cutting lines | |
i=1 | |
for (i in as.numeric(1:dim(H)[1])) { | |
beta = Hy[i,1:n_first_dimensions] | |
y = t(as.matrix(H[i,]))[,pI] | |
x = xfull[which(!is.na(y)),] | |
y = y[which(!is.na(y))] | |
zeros = as.matrix(rep(0,length(y))) | |
#** place for improvement: "1000" | |
res1 = optim(c(1),f_cutting_lines,method="Brent",lower=-1000,upper=1000) #1000? arbitrary | |
#the sign is arbitrary, the real result may be -1*; we need to minimize the other way as well!! | |
y=-y | |
res2 = optim(c(1),f_cutting_lines,method="Brent",lower=-1000,upper=1000) #1000? arbitrary | |
#the real parameter is the one with lower loss function | |
#theoretically should be the same (either +1 or -1) for all divisions(rows), however, due to missing data, the calculation may lead to a few divisions with the other direction | |
loss_f[i,] = c(res1$par,res1$value,res2$par,res2$value) | |
if (res1$value<=res2$value) { | |
parameters[i,] = c(res1$par,res1$value,1) | |
} else { | |
parameters[i,] = c(res2$par,res2$value,-1) | |
} | |
} | |
CuttingLines = list(normals=normals,parameters=parameters,loss_function=loss_f,weights=cbind(w1,w2)) | |
## | |
#cutting lines: | |
plot(Hproj[,1],Hproj[,2]) | |
I = w1*w2 > .5 | |
for (i in as.numeric(1:dim(H)[1])) { | |
if (I[i] && ((i %% 10) == 0)) { | |
beta = CuttingLines$normals[i,] | |
beta0 = CuttingLines$parameters$Parameter[i] | |
abline(-beta0/beta[2],-beta[1]/beta[2]) | |
break | |
} | |
} | |
# only second half | |
plot(Hproj[,1],Hproj[,2]) | |
for (j in as.numeric(1:round(dim(H)[1]/2))) { | |
i = round(dim(H)[1]/2) + j | |
if (I[i] && ((i %% 10) == 0)) { | |
beta = CuttingLines$normals[i,] | |
beta0 = CuttingLines$parameters$Parameter[i] | |
abline(-beta0/beta[2],-beta[1]/beta[2]) | |
#break | |
} | |
} | |
# when y=0, x is ? | |
plot(CuttingLines$parameters$Parameter/CuttingLines$normals[,1],ylim=c(-100,100)) | |
# slopes | |
plot(CuttingLines$parameters$Parameter/CuttingLines$normals[,2],ylim=c(-10,10)) | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NEW MP (or partial) | |
#New persons / partial (like a projection of divisions from a smaller time interval into all divisions) | |
analdb = dbConnect(SQLite(), dbname=paste(path,dataname,"/",analysisname,"/",analysisname,".sqlite3",sep="")) | |
datadb = dbConnect(SQLite(), dbname=paste(path,dataname,"/",dataname,".sqlite3",sep="")) | |
rotation = strsplit(dbGetQuery(analdb,"SELECT orientation FROM analysis_info")[1,1],",")[[1]] | |
rot_mp_rank = which(dimnames(Hcov)[[1]] == rotation[1]) | |
for (j in 2:length(rotation)) { | |
if (Hproj[rot_mp_rank,j-1] * as.double(rotation[j]) < 0) { | |
U[,j-1] = -1*U[,j-1] | |
} | |
} | |
aU = abs(U) | |
mp_names = dbGetQuery(datadb,"SELECT name FROM mp ORDER BY CAST(code as INTEGER)") | |
attr(G,'dimnames')[2][[1]] = mp_names$name | |
#attr(Hw0c,'dimnames')[2][[1]] = mp_names$name[pI] | |
interval_names = dbGetQuery(analdb,"SELECT distinct(interval_name) FROM analysis_division_in_interval ORDER BY interval_name") | |
#lower limit to eliminate from projections | |
lo_limitT = lo_limit | |
for (i in 1:dim(interval_names)[1]) { | |
TIf = dbGetQuery(analdb,paste("SELECT division_code, CASE interval_name='",interval_names[i,],"' WHEN 1 THEN 'TRUE' ELSE 'FALSE' END as in_interval FROM analysis_division_in_interval ORDER BY division_code",sep="")) | |
TIf$in_interval = as.logical(TIf$in_interval) | |
max_division_code = TIf$division_code[max(which(as.logical(TIf$in_interval)))] | |
min_division_code = TIf$division_code[min(which(as.logical(TIf$in_interval)))] | |
TI = TIf$in_interval | |
GTc = G[,pI] | |
GTc[!TI,] = NA | |
HTc = (GTc - attr(H,"scaled:center"))/attr(H,"scaled:scale") | |
HTIc = HTc | |
HTIc[!is.na(HTIc)] = 1 | |
HTIc[is.na(HTIc)] = 0 | |
HTw0c = HTc * w1 * w2 | |
HTw0c[is.na(HTw0c)] = 0 | |
#weights for non missing data division x persons | |
HTIcw = HTIc*w1*w2 | |
#sum of weights of divisions for each persons | |
tmp = apply(HTIcw,2,sum) | |
pTw = tmp/max(tmp) | |
#index of persons in calculation | |
pTI = pTw > lo_limitT | |
#weighted scaled with NA->0 and cutted persons with too few votes division x persons | |
HTw0cc = HTw0c[,pTI] | |
#index of missing data cutted persons with too few votes divisions x persons | |
HTIcc = HTIc[,pTI] | |
dweights = t(t(aU)%*%HTIcc / apply(aU,2,sum)) #person x division | |
dweights[is.na(dweights)] = 0 | |
HTw0ccproj = t(HTw0cc)%*%U / dweights | |
parties = as.matrix(dbGetQuery(datadb,paste('SELECT COALESCE(NULLIF(tmax.code,""),tmin.code) as code, COALESCE(NULLIF(tmax.color,""),tmin.color) as color FROM (SELECT mvf.mp_code, g.code, g.color FROM mp_vote_full as mvf LEFT JOIN "group" as g ON mvf.group_code=g.code WHERE division_code="',max_division_code,'") as tmax LEFT JOIN (SELECT mvf.mp_code, g.code, g.color FROM mp_vote_full as mvf LEFT JOIN "group" as g ON mvf.group_code=g.code WHERE division_code="',min_division_code,'") as tmin ON tmax.mp_code=tmin.mp_code ORDER BY CAST(tmax.mp_code as INTEGER)',sep=""))) | |
partiescc = (parties[pI,])[pTI,] | |
#mp rank id | |
mp_rank_id_cc = seq(1,dim(HTw0c)[2],1)[,pTI] | |
write.csv(format(cbind(HTw0ccproj[,1:2],partiescc,mp_rank_id_cc),digits=3),file=paste(path,dataname,"/",analysisname,"/",interval_names[i,],"_result.csv",sep="")) | |
#************************* | |
} | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#matrix (operations): | |
en.wikipedia.org/wiki/Matrix_(mathematics) | |
#rotation matrix: | |
http://en.wikipedia.org/wiki/Rotation_matrix | |
#SVD: | |
http://en.wikipedia.org/wiki/Singular_value_decomposition | |
#Eigendecomposition | |
http://en.wikipedia.org/wiki/Eigendecomposition | |
#SVD vs. eigendec. | |
http://stats.stackexchange.com/questions/14002/whats-the-difference-between-principal-components-analysis-and-multidimensional | |
#SVD example: | |
http://polisci2.ucsd.edu/aruizeuler/SVD.html | |
#normal and distance point,plane | |
http://en.wikipedia.org/wiki/Normal_%28geometry%29 | |
http://en.wikipedia.org/wiki/Distance_from_a_point_to_a_line | |
http://www.vitutor.com/geometry/distance/point_plane.html | |
# multiply rows of matrix by vector in R (speed) | |
http://stackoverflow.com/questions/3643555/multiply-rows-of-matrix-by-vector | |
#ideal points | |
http://voteview.com/ideal_point_history.htm | |
#wnominate in R | |
http://cran.r-project.org/web/packages/wnominate/vignettes/wnominate.pdf | |
http://cran.r-project.org/web/packages/wnominate/wnominate.pdf | |
http://rss.acs.unt.edu/Rdoc/library/pscl/html/rollcall.html | |
http://voteview.com/senate109.htm | |
#US roll-call downloads | |
http://voteview.com/downloads.asp#ROLLCALLDWNL | |
#linear discriminant analysis (not used finally) | |
http://en.wikipedia.org/wiki/Linear_discriminant_analysis | |
#logistic regression (not used finally) | |
http://en.wikipedia.org/wiki/Linear_discriminant_analysis | |
#SVM and kernels (not used finally) | |
http://en.wikipedia.org/wiki/Support_vector_machine | |
http://en.wikibooks.org/wiki/Data_Mining_Algorithms_In_R/Classification/SVM | |
http://stats.stackexchange.com/questions/26293/how-to-obtain-decision-boundaries-from-linear-svm-in-r | |
http://stats.stackexchange.com/questions/5056/computing-the-decision-boundary-of-a-linear-svm-model/5080#5080 | |
http://stats.stackexchange.com/questions/25387/problem-with-e1071-libsvm | |
http://stats.stackexchange.com/questions/17955/svm-with-unequal-group-sizes-in-training-data?rq=1 | |
http://cran.r-project.org/web/packages/kernlab/kernlab.pdf | |
http://cbio.ensmp.fr/~jvert/svn/tutorials/practical/svmbasic/svmbasic_notes.pdf | |
http://stackoverflow.com/questions/3282916/class-weight-syntax-in-kernlab | |
http://www.robots.ox.ac.uk/~az/lectures/ml/lect2.pdf | |
http://stat.ethz.ch/R-manual/R-patched/library/stats/html/density.html | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
p1 | p2 | p3 | p4 | |
---|---|---|---|---|
1 | 1 | 1 | -1 | |
1 | 1 | -1 | -1 | |
1 | 1 | -1 | -1 | |
1 | -1 | 1 | -1 | |
1 | -1 | 1 | -1 | |
1 | -1 | -1 | -1 | |
-1 | -1 | -1 | 1 | |
-1 | -1 | -1 | 1 | |
1 | 1 | -1 | -1 | |
1 | -1 | 1 | -1 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#install.packages("reshape2") | |
library("reshape2") | |
#install.packages("sqldf") | |
library("sqldf") | |
path = "~/project/mpv_motion/dev/" | |
#dataname = "cz_psp_2006-2010_a1_1" | |
#analysisname = 'cz_psp_2006-2010_a1_1q' | |
dataname = "cz_kromeriz_2012" | |
analysisname = 'cz_kromeriz_2012_1h' | |
#raw data in 3 columns (values in [-1,1]) | |
Graw = read.table(paste(path,dataname,"/",dataname,".csv",sep=""), header=FALSE, sep=",") | |
#data divisions x persons | |
G = acast(Graw,V2~V1,value.var='V3') | |
#scaled divisions x persons (mean=0 and sd=1 for each division) | |
H=t(scale(t(G),scale=TRUE)) | |
#weights | |
#weights 1, based on number of persons in division | |
w1 = apply(abs(G)==1,1,sum,na.rm=TRUE)/max(apply(abs(G)==1,1,sum,na.rm=TRUE)) | |
w1[is.na(w1)] = 0 | |
#weights 2, "100:100" vs. "195:5" | |
w2 = 1 - abs(apply(G==1,1,sum,na.rm=TRUE) - apply(G==-1,1,sum,na.rm=TRUE))/apply(!is.na(G),1,sum) | |
w2[is.na(w2)] = 0 | |
#analytics | |
plot(w1) | |
plot(w2) | |
plot(w1*w2) | |
#weighted scaled divisions x persons | |
Hw = H * w1 * w2 | |
#index of missing data | |
#index of missing data divisions x persons | |
HI = H | |
HI[!is.na(H)]=1 | |
HI[is.na(H)] = 0 | |
#weighted scaled with NA->0 division x persons | |
Hw0 = Hw | |
Hw0[is.na(Hw)]=0 | |
#eliminate persons with too few votes (weighted) | |
#lower limit to eliminate from calculations | |
lo_limit = .1 | |
#weights for non missing data division x persons | |
HIw = HI*w1*w2 | |
#sum of weights of divisions for each persons | |
tmp = apply(HIw,2,sum) | |
pw = tmp/max(tmp) | |
#index of persons in calculation | |
pI = pw > lo_limit | |
#weighted scaled with NA->0 and cutted persons with too few votes division x persons | |
Hw0c = Hw0[,pI] | |
#index of missing data cutted persons with too few votes divisions x persons | |
HIc = HI[,pI] | |
#"covariance" matrix adjusted according to missing data | |
Hcov=t(Hw0c)%*%Hw0c * 1/(t(HIc)%*%HIc) * (dim(Hw0c)[1]) | |
#Hcov=t(Hw0)%*%Hw0 * 1/(t(HI)%*%HI-1) * (dim(H)[1]-1) | |
#substitution of missing data in "covariance" matrix | |
Hcov0 = Hcov | |
Hcov0[is.na(Hcov)] = 0 #********* | |
#eigendecomposition | |
He=eigen(Hcov0) | |
# V (rotation values of persons) | |
V = He$vectors | |
#projected divisions into dimensions | |
Hy=Hw0c%*%V | |
#analytics | |
plot(Hy[,1],Hy[,2]) | |
plot(sqrt(He$values[1:10])) | |
#sigma matrix | |
sigma = diag(sqrt(He$values)) | |
sigma[is.na(sigma)] = 0 | |
#projection of persons into dimensions | |
Hproj = t(sigma%*%t(V)) | |
#analytics | |
plot(Hproj[,1],Hproj[,2]) | |
#sigma^-1 matrix | |
sigma_1 = diag(sqrt(1/He$values)) | |
sigma_1[is.na(sigma_1)] = 0 | |
# U (rotation values of divisions) | |
U = Hw0c%*%V%*%sigma_1 | |
#U%*%sigma%*%t(V) != Hw0c ;because of adjusting of "covariance" matrix | |
#THE PROBLEM (if T - overall otherwise see Hproj2) | |
plot(He$vectors[,1]*sqrt(He$value[1]),He$vectors[,2]*sqrt(He$value[2])) | |
#second projection | |
Hproj2 = t(t(U) %*% Hw0c) | |
#without missing values should be equal | |
#analytics | |
plot(Hproj[,1],Hproj2[,1]) | |
plot(Hproj[,2],Hproj2[,2]) | |
#plot(HTw0ccproj[,1],HTw0ccproj[,2]) | |
#plot(Hproj[,1],HTw0ccproj[,1]) | |
#plot(Hproj[,2],HTw0ccproj[,2]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment