-
-
Save xccds/4276525 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 数据读入 | |
data <- read.csv('qq.csv',T,stringsAsFactors=F) | |
data <- data[-nrow(data),] # 最后一行有问题,删除 | |
library(stringr) | |
library(plyr) | |
library(lubridate) | |
library(ggplot2) | |
library(reshape2) | |
library(igraph) | |
# 数据整理 | |
# 将字符串中的日期和时间划分为不同变量 | |
temp1 <- str_split(data$time,' ') | |
result1 <- ldply(temp1,.fun=NULL) | |
names(result1) <- c('date','clock') | |
#分离年月日 | |
temp2 <- str_split(result1$date,'/') | |
result2 <- ldply(temp2,.fun=NULL) | |
names(result2) <- c('year','month','day') | |
# 分离小时分钟 | |
temp3 <- str_split(result1$clock,':') | |
result3 <- ldply(temp3,.fun=NULL) | |
names(result3) <- c('hour','minutes') | |
# 合并数据 | |
newdata <- cbind(data,result1,result2,result3) | |
# 转换日期为时间格式 | |
newdata$date <- ymd(newdata$date) | |
# 提取星期数据 | |
newdata$wday <- wday(newdata$date) | |
# 转换数据格式 | |
newdata$month <- ordered(as.numeric(newdata$month) ) | |
newdata$year <- ordered(newdata$year) | |
newdata$day <- ordered(as.numeric(newdata$day)) | |
newdata$hour <- ordered(as.numeric(newdata$hour)) | |
newdata$wday <- ordered(newdata$wday) | |
# 关于时间的一元描述 | |
# 观察时间相关各变量的频数分布 | |
# 周一和周日聊天不多,难道说是周一要安心上班? | |
qplot(wday,data=newdata,geom='bar') | |
# 白天上班的时间聊天比较多嘛,下午形成高峰。 | |
qplot(hour,data=newdata,geom='bar') | |
# 关于用户的频度描述 | |
# 前十大发言最多用户 | |
user <- as.data.frame(table(newdata$id)) | |
user <- user[order(user$Freq,decreasing=T),] | |
user[1:10,] | |
topuser <- user[1:10,]$Var1 | |
# 活跃天数计算 | |
# 将数据展开为宽表,每一行为用户,每一列为日期,对应数值为发言次数 | |
flat.day <- dcast(newdata,id~date,length,value.var='date') | |
flat.mat <- as.matrix(flat.day[-1]) #转为矩阵 | |
# 转为0-1值,以观察是否活跃 | |
flat.mat <- ifelse(flat.mat>0,1,0) | |
# 根据上线天数求和 | |
topday <- data.frame(flat.day[,1],apply(flat.mat,1,sum)) | |
names(topday) <- c('id','days') | |
topday <- topday[order(topday$days,decreasing=T),] | |
# 获得前十大活跃用户 | |
topday[1:10,] | |
# 观察每天的发言次数 | |
# online.day为每天的发言次数 | |
online.day <- sapply(flat.day[,-1],sum) | |
tempdf <- data.frame(time=ymd(names(online.day )),online.day ) | |
qplot(x=time,y=online.day ,ymin=0,ymax=online.day , | |
data=tempdf,geom='linerange') | |
ggsave('2.png') | |
# 观察到有少数峰值日,看超过200次发言以上是哪几天 | |
names(which(online.day>200) | |
#根据flat.day数据观察每天活跃用户变化 | |
# numday为每天发言人数 | |
numday <- apply(flat.mat,2,sum) | |
tempdf <- data.frame(time=ymd(names(numday)),numday) | |
qplot(x=time,y=numday,ymin=0,ymax=numday, | |
data=tempdf,geom='linerange') | |
ggsave('3.png') | |
#直方图观察 | |
qplot(x=numday,data=tempdf,geom='histogram') | |
# 当某天登录人数增加的话,发言数也会增加吗? | |
tempdf <- data.frame(time=ymd(names(online.day )),people=numday, | |
speech=online.day) | |
qplot(x=people,y=speech , | |
data=tempdf,geom=c('point','smooth')) | |
# 再观察十强选手的日内情况 | |
flat.hour <- dcast(newdata,id~hour,length,value.var='hour', | |
subset=.(id %in% topuser)) | |
# 平行坐标图 | |
hour.melt <- melt(flat.hour) | |
p <- ggplot(data=hour.melt,aes(x=variable,y=value)) | |
p + geom_line(aes(group=id,color=id))+ | |
theme_bw()+ | |
opts(legend.position = "none") | |
# 连续对话的次数,以三十分钟为间隔 | |
newdata$realtime <- strptime(newdata$time,'%Y/%m/%d %H:%M') | |
# 时间排序有问题,按时间重排数据 | |
newdata2 <- newdata[order(newdata$realtime),] | |
# 将数据按讨论来分组 | |
group <- rep(1,11279) | |
for (i in 2:11279) { | |
d <- as.numeric(difftime(newdata2$realtime[i], | |
newdata2$realtime[i-1], | |
units='mins')) | |
if ( d <30) { | |
group[i] <- group[i-1] | |
} else {group[i] <- group[i-1]+1} | |
} | |
barplot(table(group)) | |
# 得到719多组对话 | |
newdata2$group <- group | |
# igraph进行十强之间的网络分析 | |
# 建立关系矩阵,如果两个用户同时在一次群讨论中出现,则计数+1 | |
newdata3 <- dcast(newdata2, id~group, sum, | |
value.var='group', | |
subset=.(id %in% topuser)) | |
newdata4 <- ifelse(newdata3[,-1] > 0, 1, 0) | |
rownames(newdata4) <- newdata3[,1] | |
relmatrix <- newdata4 %*% t(newdata4) | |
# 很容易看出哪两个人聊得最多,6cha4376和4cha3875,有基情? | |
deldiag <- relmatrix-diag(diag(relmatrix)) | |
which(deldiag==max(deldiag),arr.ind=T) | |
# 根据关系矩阵画社交网络画 | |
g <- graph.adjacency(relmatrix,weighted=T,mode='undirected') | |
g <-simplify(g) | |
V(g)$label<-rownames(relmatrix) | |
V(g)$degree<- degree(g) | |
layout1 <- layout.fruchterman.reingold(g) | |
egam <- 10*E(g)$weight/max(E(g)$weight) | |
egam <- (log(E(g)$weight)+1) / max(log(E(g)$weight)+1) | |
V(g)$label.cex <- V(g)$degree / max(V(g)$degree)+ .2 | |
V(g)$label.color <- rgb(0, 0, .2, .8) | |
V(g)$frame.color <- NA | |
E(g)$width <- egam | |
E(g)$color <- rgb(0, 0, 1, egam) | |
plot(g, layout=layout1) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment