Created
December 31, 2015 05:22
-
-
Save dmarcelinobr/158214ab9d53c902f920 to your computer and use it in GitHub Desktop.
trying to reduce dimensionality of a golden questions survey in insurance Raw
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################ | |
# Setup | |
################ | |
rm(list=(ls())) | |
#libs | |
library(reshape2) | |
library(plyr) | |
library(stringr) | |
#cluster&pca | |
library(cluster) | |
library(apcluster) | |
library(psych) | |
library(FactoMineR) | |
library(poLCA) | |
library(corrplot) | |
library(ggplot2) | |
#wd | |
setwd("C:/Users/fait/Dropbox/Cardif") | |
dir <- getwd() | |
################ | |
# Data | |
################ | |
#read in data | |
d_raw <- read.csv2("czech_raw_data_joined.csv",stringsAsFactors=F) | |
d_demo <- read.csv2("czech_raw_data_demo.csv",stringsAsFactors=F) | |
#reshape data so questions are in variables | |
d_t<-reshape(d_raw, idvar="uid", timevar="question", direction = "wide", drop="question_text") | |
d_t[is.na(d_t)]<-0 | |
#join with demographics | |
d <- join(d_t,d_demo,by="uid") | |
#get question names | |
q <- unique(d_raw[c("question","question_text")]) | |
#vector of perceived_risk variables | |
ps <- d[,str_detect(names(d),"(p.[0-9])")] | |
#vector of insurance willingness variables | |
is <- d[,str_detect(names(d),"(i.[0-9])")] | |
################ | |
# Analysis | |
################ | |
#compute correlations | |
cor <- cor(ps,use="pairwise.complete.obs") | |
cor[is.na(cor)]<-0 | |
diag(cor)<-0 | |
png(filename = "corrplot.png",1500, 1500, units = "px", pointsize = 12,type = "cairo-png") | |
corplot <- corrplot(cor,method="circle",outline=F,order="original",cl.ratio = 0.2,cl.cex=2,tl.cex=1,tl.col="#282626",pch.cex=2) | |
dev.off() | |
#mds on rows gives a lousy solution due to missing values | |
d_dist <- dist(d) | |
fit <- cmdscale(d_dist,eig=TRUE, k=2) | |
# plot solution | |
x <- fit$points[,1] | |
y <- fit$points[,2] | |
plot(x, y, xlab="Dim 1", ylab="Dim 2", main="MDS of respondents") | |
text(x, y, labels = d$uid, cex=.7) | |
#lets try daisy from cluster package which is a dissimilarity matrix that can handle pairwaise complete | |
d_dist <- daisy(ps,metric="euclidean",stand=F) | |
str(d_dist) | |
#pca from stats | |
pca <- princomp(ps,cor=T,na.action=na.omit) | |
pca$loadings | |
screeplot(pca) | |
#pca from psych | |
pca <- prcomp(ps, nfactors=10, rotate="varimax") | |
summary(pca) | |
screeplot(pca) | |
#pca from FactoMineR (vliv missingu na PCA ???) | |
pca <- PCA(ps) | |
pca$var | |
plot.PCA(pca) | |
#clustering that works kindof | |
clu <-pam(ps, 5, diss = F, metric = "euclidean", stand = F) | |
plot(silhouette(clu),main="Result of clustering, anything cluster with center over 0.5 is good") | |
table(clu$clustering) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment