Skip to content

Instantly share code, notes, and snippets.

@xccds
Created April 4, 2015 11:28
Show Gist options
  • Save xccds/b4021fb49226a70b9a8c to your computer and use it in GitHub Desktop.
Save xccds/b4021fb49226a70b9a8c to your computer and use it in GitHub Desktop.
# Load
library(plyr)

# Read the raw export, keeping text columns as plain character vectors.
df <- read.csv("2014data.csv", stringsAsFactors = FALSE)

# Columns 4-53 hold the question answers; columns 54-57 hold the four
# respondent attributes (coded below as y1..y4).
df1 <- df[4:53]
df2 <- df[54:57]

# Tidy
# Remember the original headers, then switch to short positional names
# (x1, x2, ... for df1 and y1, y2, ... for df2).
df1_names <- names(df1)
df2_names <- names(df2)
names(df1) <- paste0("x", seq_len(ncol(df1)))
names(df2) <- paste0("y", seq_len(ncol(df2)))
# Convert a vector of agreement answers into numeric scores.
#
# Scores: "强烈同意" -> 2, "同意" -> 1, "反对" -> -1, "强烈反对" -> -2.
# Any other value (empty string, NA, unexpected text) becomes NA.
#
# x: character vector or factor of answers.
# Returns a numeric vector of the same length as x.
#
# The lookup is done explicitly with match() instead of relying on
# as.numeric() over the recoded values: as.numeric() on a factor would return
# the level codes rather than the scores, and on character input it would
# let stray numeric-looking text through as if it were a score.
map_func <- function(x) {
  answers <- c("强烈同意", "同意", "反对", "强烈反对")
  scores <- c(2, 1, -1, -2)
  # as.character() makes factor input behave like character input.
  scores[match(as.character(x), answers)]
}
# Score every question column with map_func.
df1_1 <- colwise(map_func)(df1)

# Recode the four respondent attributes into numeric codes.
df2_2 <- df2

# y1: 1 where the raw value is 'F', 0 otherwise.
df2_2$y1 <- ifelse(df2$y1 == "F", 1, 0)

# y2: subtract the raw value from 2015, then bin the difference into ten
# right-closed intervals coded 1..10; values outside (0, 120] become NA.
y2_breaks <- c(0, 18, 22, 25, 30, 35, 40, 50, 60, 70, 120)
y2_diff <- 2015 - df2_2$y2
df2_2$y2 <- as.numeric(cut(y2_diff, breaks = y2_breaks, labels = 1:10))

# y3: ordered bracket labels -> 1..7.
y3_labels <- c("0-25k", "25k-50k", "50k-75k", "75k-100k",
               "100k-150k", "150k-300k", "300k+")
df2_2$y3 <- as.numeric(mapvalues(df2$y3, from = y3_labels, to = 1:7))

# y4: ordered education labels -> 1..4.
y4_labels <- c("初中及以下", "高中", "大学", "研究生及以上")
df2_2$y4 <- as.numeric(mapvalues(df2$y4, from = y4_labels, to = 1:4))
# Remove problematic records.
# Put scored answers and recoded attributes side by side and keep only rows
# without any missing value.
df3 <- cbind(df1_1, df2_2)
df4 <- df3[complete.cases(df3), ]

# The filtering must start from df4: df5 does not exist yet at this point, so
# the original `subset(df5, ...)` failed with "object 'df5' not found".
# Drop rows in the last y2 bin (code 10, i.e. the (70, 120] interval).
df5 <- subset(df4, !(y2 == 10))
# Drop rows in the first y2 bin (code 1) that also report the highest y4 code
# (4, "研究生及以上").
df5 <- subset(df5, !(y2 == 1 & y4 == 4))
# Drop rows in the first y2 bin that also report a y3 code above 5
# ("150k-300k" or "300k+").
df5 <- subset(df5, !(y2 == 1 & y3 > 5))
# Mutual information between two discrete vectors, divided by the mean of
# their two entropies (all logarithms base 2).
#
# x, y: vectors of equal length; their distinct values are treated as
#       categories via table().
# Returns a single number: sum(pxy * log2(pxy / (px * py))) divided by
# 0.5 * (H(x) + H(y)), where a small constant is added inside every log2()
# so that empty cells do not produce log2(0).
im_func <- function(x, y) {
  eps <- 1e-8

  # Marginal and joint relative frequencies.
  p_x <- as.vector(prop.table(table(x)))
  p_y <- as.vector(prop.table(table(y)))
  p_xy <- matrix(prop.table(table(x, y)), ncol = length(p_y))

  # Joint distribution expected under independence: p_x[i] * p_y[j].
  p_indep <- outer(p_x, p_y)

  mutual_info <- sum(p_xy * (log2(p_xy + eps) - log2(eps + p_indep)))
  mean_entropy <- -0.5 * (sum(p_x * log2(p_x + eps)) + sum(p_y * log2(p_y + eps)))

  mutual_info / mean_entropy
}
# Pairwise im_func score for every pair of df5 columns. Only the lower
# triangle (column index <= row index) is filled; the upper triangle stays NA
# and the diagonal is reset to 0 afterwards.
m <- ncol(df5)
result <- matrix(nrow = m, ncol = m)
for (i in 1:m) {
  partners <- 1:i
  result[i, partners] <- vapply(
    partners,
    function(j) im_func(df5[[i]], df5[[j]]),
    numeric(1)
  )
}
diag(result) <- 0
# Every question index used below is derived from `result` instead of being
# typed in by hand. The hand-typed version printed the text of question 35
# but tabulated x30 in the age section, so header and table could disagree.

# Question columns come first in df5 (x1, x2, ...), one per saved header.
n_questions <- length(df1_names)
question_idx <- seq_len(n_questions)

# Rows/columns of `result` that belong to the four respondent attributes.
demo_names <- c("y1", "y2", "y3", "y4")
demo_rows <- setNames(match(demo_names, names(df5)), demo_names)

# Index of the question with the highest score against row `row_idx` of
# `result`. Only question columns are ranked, so another attribute can never
# be picked by accident.
top_question <- function(row_idx) {
  order(result[row_idx, question_idx], decreasing = TRUE)[1]
}

# Which two questions are most related to each other?
# The search is restricted to the question-by-question part of the matrix.
question_scores <- result[question_idx, question_idx]
max_v <- max(question_scores, na.rm = TRUE)
top_pair <- which(question_scores == max_v, arr.ind = TRUE)
top_pair
pair_idx <- sort(unname(top_pair[1, ]))
df1_names[pair_idx]
table(df5[[paste0("x", pair_idx[1])]], df5[[paste0("x", pair_idx[2])]])

# Which question is most related to education (y4)?
order(result[demo_rows[["y4"]], ], decreasing = TRUE)
edu_q <- top_question(demo_rows[["y4"]])
df1_names[edu_q]
table(df5[[paste0("x", edu_q)]], df5$y4)

# Which question is most related to age (y2)?
order(result[demo_rows[["y2"]], ], decreasing = TRUE)
age_q <- top_question(demo_rows[["y2"]])
df1_names[age_q]
table(df5[[paste0("x", age_q)]], df5$y2)

# Which question is most related to income (y3)?
order(result[demo_rows[["y3"]], ], decreasing = TRUE)
income_q <- top_question(demo_rows[["y3"]])
df1_names[income_q]
table(df5[[paste0("x", income_q)]], df5$y3)

# Which question is most related to gender (y1)?
order(result[demo_rows[["y1"]], ], decreasing = TRUE)
gender_q <- top_question(demo_rows[["y1"]])
df1_names[gender_q]
table(df5[[paste0("x", gender_q)]], df5$y1)
# Model
library(gbm)

# Boosted multinomial model of the income code (y3) on every other column of
# df5.
model <- gbm(
  y3 ~ .,
  data = df5,
  distribution = "multinomial",
  n.trees = 200,
  shrinkage = 0.01,
  train.fraction = 0.8,
  cv.folds = 5
)

# Take the first slice of the response-scale prediction array as a 7-column
# matrix, then pick the column with the largest value in each row.
pred <- matrix(predict(model, type = "response")[, , 1], ncol = 7)
pred_y <- apply(pred, 1, which.max)

# Relative influence per predictor; list the non-zero ones in ascending order.
coef <- relative.influence(model)
sort(coef[coef > 0])

# Question text and cross-tabulation against y3 for questions 16, 41 and 12.
df1_names[16]
table(df5[["x16"]], df5[["y3"]])
df1_names[41]
table(df5[["x41"]], df5[["y3"]])
df1_names[12]
table(df5[["x12"]], df5[["y3"]])
# # 政治
# df5$poli = rowMeans(df5[,1:20])
# # 经济
# df5$econ = rowMeans(df5[,21:40])
# # 文化
# df5$cult = rowMeans(df5[,41:50])
#
# # cluster
# library(fpc)
# pka <- kmeansruns(df5[,c('poli','econ','cult')],krange=2:6,critout=TRUE,runs=2,criterion="asw")
@chientung91
Copy link

line #39, did you mean:

df5 = subset(df4, !(y2==10))

@cmldyu
Copy link

cmldyu commented Dec 30, 2016

thx a lot~

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment