Created
July 10, 2016 10:11
-
-
Save teos0009/3986bfdfe455a0756734d57b822c05b8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tm) | |
library(RXKCD) | |
library(XML) | |
library(tm) | |
library(wordcloud) | |
library(RColorBrewer) | |
library(rJava) | |
library(RWeka) | |
library(Snowball) | |
library(stringr) | |
library(igraph) | |
# Load the exported tweet table. Empty strings and the literal "NULL" are
# read as NA, and text columns stay character instead of becoming factors.
ap.df <- read.csv("polyforce21.csv", header = TRUE, sep = ",",
                  stringsAsFactors = FALSE,
                  na.strings = c("NA", "", "NULL"))
# attach() is deliberately not called: every later reference is qualified
# with `$`, so attaching would only risk masking other objects.
names(ap.df)
summary(ap.df)
# Keep columns 1, 2 and 4 (tweet, author and date per the original note).
ap.sub1 <- ap.df[, c(1, 2, 4)]
names(ap.sub1)
head(ap.sub1$Title)
# ==select data of interest only====
# Pattern for the polytechnic handles, full names and abbreviations. It is
# defined once so the index, value and row selections below cannot drift
# apart. All searches with it are case-insensitive.
# NOTE(review): the short alternatives (sp, np, nyp, tp, rp, poly) are only
# anchored by the trailing \b, so they presumably also match as word endings
# (for example the "tp" of "http") -- confirm this breadth is intended.
poly_pattern <- "(SingaporePoly|NgeeAnnNP|temasekpoly|Nanyang Polytechnic|Singapore Polytechnic|Ngee Ann Polytechnic|Temasek Polytechnic|Republic Polytechnic|sp|np|nyp|tp|rp|poly)((?:\\b)+)"

# Row numbers of ap.sub1 whose Title matches the pattern.
poly_rows <- grep(poly_pattern, ap.sub1$Title, ignore.case = TRUE,
                  value = FALSE)
poly_rows

# The matching tweet texts themselves (character vector).
local <- grep(poly_pattern, ap.sub1$Title, ignore.case = TRUE, value = TRUE)
summary(local)
names(local)
head(local)

# The matching rows with all three columns kept; later sections work on this.
native <- ap.sub1[poly_rows, ]
summary(native)
names(native)
head(native)

# value = FALSE returns positions within native$Title, value = TRUE the text.
grep("RT @temasekpoly", native$Title, ignore.case = TRUE, value = FALSE)
grep("RT @temasekpoly", native$Title, ignore.case = TRUE, value = TRUE)
# =select interested rows=
# Rows of ap.sub1 whose Title contains the Porsche internship headline
# (case-sensitive match).
is_porsche <- grepl("Porsche offers internships to Ngee Ann Polytechnic",
                    ap.sub1$Title)
selPorsche <- ap.sub1[is_porsche, ]
names(selPorsche)
head(selPorsche)
# =====who retweet========
# Approach adapted from
# https://sites.google.com/site/miningtwitter/questions/user-tweets/who-retweet
grep("@STcom", native$Title, ignore.case = TRUE, value = TRUE)
native$Title

# A retweet marker: "RT" or "via" followed by one or more @handles.
rt_regex <- "(RT|via)((?:\\b\\W*@\\w+)+)"

# Display the tweets that carry a retweet marker.
grep(rt_regex, native$Title, ignore.case = TRUE, value = TRUE)

# Positions of those tweets WITHIN native$Title.
rt_patterns <- grep(rt_regex, native$Title, ignore.case = TRUE)
rt_patterns

# rt_patterns was computed on native, so it may only be used to index native.
# Indexing ap.sub1 with it picks unrelated rows, because native is a filtered
# subset and its positions do not line up with ap.sub1's.
native$Title[rt_patterns]
summary(native$Title[rt_patterns])
head(native$Title[rt_patterns])

# Retweets of @temasekpoly among the selected tweets.
grep("RT @temasekpoly", native$Title, ignore.case = TRUE, value = TRUE)
# ======process nodes and edges===
# Build two parallel character vectors with one entry per retweet mention:
#   who_retweet - author of the retweeting tweet
#   who_post    - handle that was retweeted
# The extraction is vectorised, so an empty rt_patterns produces two empty
# vectors instead of failing inside a 1:0 loop.
rt_titles <- native$Title[rt_patterns]
rt_authors <- native$Author[rt_patterns]

# One list element per tweet, holding every "RT @user" / "via @user" match.
# str_extract_all is case-sensitive, so a tweet selected only through
# ignore.case = TRUE may contribute zero matches (and therefore zero edges).
rt_mentions <- str_extract_all(rt_titles, "(RT|via)((?:\\b\\W*@\\w+)+)")

# Repeat each author once per mention found in that author's tweet.
who_retweet <- rep(rt_authors, lengths(rt_mentions))

# Drop ':' first (the \W* part can capture "RT: @user"), then strip the
# leading "RT @" / "via @" so only the handle remains.
who_post <- gsub(":", "", unlist(rt_mentions))
who_post <- gsub("(RT @|via @)", "", who_post, ignore.case = TRUE)
# ===gen igraph========
# Two-column edge matrix: each row is one edge, retweeter -> retweeted user.
retweeter_poster <- cbind(who_retweet, who_post)
# head(retweeter_poster)
# write.csv(retweeter_poster, file = "RTnS.csv")

# Build the graph; vertex names come from the strings in the edge matrix.
rt_graph <- graph.edgelist(retweeter_poster)
# Optional exports of the edge list:
# write.graph(rt_graph, "tweetedge.txt", format = c("edgelist"))
# write.graph(rt_graph, "tweetedge.pajek", format = c("pajek"))

# Vertex names and degrees; the plotting section uses both.
ver_labs <- get.vertex.attribute(rt_graph, "name", index = V(rt_graph))
ver_deg <- degree(rt_graph)

# Show the vertices with degree above 10.
V(rt_graph)[degree(rt_graph) > 10]

# Find one diameter path and mark its edges and vertices in white.
dia <- get.diameter(rt_graph)
E(rt_graph, path = dia)$color <- "white"
E(rt_graph, path = dia)$width <- 10
V(rt_graph)[dia]$label.color <- "white"
V(rt_graph)[dia]$color <- "white"

# Layout. Alternatives that were tried:
# glay <- layout.fruchterman.reingold(rt_graph)
# glay <- layout.kamada.kawai(rt_graph)  # very slow
# The area is derived from rt_graph's own vertex count; the previous
# reference to an undefined object `g` made this call fail.
# NOTE(review): recent igraph releases reportedly ignore `area` -- confirm
# against the installed version.
glay <- layout.fruchterman.reingold(rt_graph, niter = 2000,
                                    area = vcount(rt_graph)^5)
# ===plot graph===============
# Semi-transparent colours for labels and edges on the dark background.
label_col <- hsv(h = 0, s = 0, v = .95, alpha = 0.5)
edge_col <- hsv(h = .95, s = 1, v = .7, alpha = 0.5)

# Everything drawn between pdf() and dev.off() goes into this file.
pdf("TEST retweet net.pdf")
par(bg = "gray15", mar = c(1, 1, 1, 1))

# Vertices are drawn as bare labels (shape "none"); label size scales with
# vertex degree. Pass vertex.label = NA to hide the labels.
plot(
  rt_graph,
  layout = glay,
  # vertex appearance
  vertex.shape = "none",
  vertex.color = "gray25",
  vertex.size = ver_deg / 100,
  # vertex labels
  vertex.label = ver_labs,
  vertex.label.family = "sans",
  vertex.label.color = label_col,
  vertex.label.cex = ver_deg / 50,
  # edges
  edge.color = edge_col,
  edge.width = 0.1,
  edge.arrow.size = 0.8,
  edge.arrow.width = 0.5
)

# Title drawn on top of the finished plot.
title(main = "\nTweets with 'satisfied parameters': Who retweets whom",
      cex.main = 1, col.main = "gray95")

dev.off()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment