Skip to content

Instantly share code, notes, and snippets.

@dmarcelinobr
Created December 31, 2015 05:22
Show Gist options
  • Save dmarcelinobr/158214ab9d53c902f920 to your computer and use it in GitHub Desktop.
Save dmarcelinobr/158214ab9d53c902f920 to your computer and use it in GitHub Desktop.
trying to reduce dimensionality of a golden questions survey in insurance Raw
################
# Setup
################
rm(list=(ls()))
#libs
library(reshape2)
library(plyr)
library(stringr)
#cluster&pca
library(cluster)
library(apcluster)
library(psych)
library(FactoMineR)
library(poLCA)
library(corrplot)
library(ggplot2)
#wd
setwd("C:/Users/fait/Dropbox/Cardif")
dir <- getwd()
################
# Data
################
#read in data
d_raw <- read.csv2("czech_raw_data_joined.csv",stringsAsFactors=F)
d_demo <- read.csv2("czech_raw_data_demo.csv",stringsAsFactors=F)
#reshape data so questions are in variables
d_t<-reshape(d_raw, idvar="uid", timevar="question", direction = "wide", drop="question_text")
d_t[is.na(d_t)]<-0
#join with demographics
d <- join(d_t,d_demo,by="uid")
#get question names
q <- unique(d_raw[c("question","question_text")])
#vector of perceived_risk variables
ps <- d[,str_detect(names(d),"(p.[0-9])")]
#vector of insurance willingness variables
is <- d[,str_detect(names(d),"(i.[0-9])")]
################
# Analysis
################
#compute correlations
cor <- cor(ps,use="pairwise.complete.obs")
cor[is.na(cor)]<-0
diag(cor)<-0
png(filename = "corrplot.png",1500, 1500, units = "px", pointsize = 12,type = "cairo-png")
corplot <- corrplot(cor,method="circle",outline=F,order="original",cl.ratio = 0.2,cl.cex=2,tl.cex=1,tl.col="#282626",pch.cex=2)
dev.off()
#mds on rows gives a lousy solution due to missing values
d_dist <- dist(d)
fit <- cmdscale(d_dist,eig=TRUE, k=2)
# plot solution
x <- fit$points[,1]
y <- fit$points[,2]
plot(x, y, xlab="Dim 1", ylab="Dim 2", main="MDS of respondents")
text(x, y, labels = d$uid, cex=.7)
#lets try daisy from cluster package which is a dissimilarity matrix that can handle pairwaise complete
d_dist <- daisy(ps,metric="euclidean",stand=F)
str(d_dist)
#pca from stats
pca <- princomp(ps,cor=T,na.action=na.omit)
pca$loadings
screeplot(pca)
#pca from psych
pca <- prcomp(ps, nfactors=10, rotate="varimax")
summary(pca)
screeplot(pca)
#pca from FactoMineR (vliv missingu na PCA ???)
pca <- PCA(ps)
pca$var
plot.PCA(pca)
#clustering that works kindof
clu <-pam(ps, 5, diss = F, metric = "euclidean", stand = F)
plot(silhouette(clu),main="Result of clustering, anything cluster with center over 0.5 is good")
table(clu$clustering)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment