teos0009 · July 10, 2016 09:40
diff --git a/tmsubset.r b/tmsubset.r
 # Install the Java / stemming back-ends only when they are not installed yet,
 # so re-running the script does not download and reinstall them every time.
 # NOTE(review): "Snowball" appears to have been archived on CRAN (SnowballC
 # looks like its successor) -- confirm it is still installable here.
 local({
   wanted <- c("RWeka", "rJava", "Snowball")
   # system.file() returns "" for a package that is not installed; unlike
   # requireNamespace() it does not try to load the package (rJava needs a JVM).
   install_dirs <- vapply(
     wanted,
     function(p) system.file(package = p),
     character(1)
   )
   absent <- wanted[!nzchar(install_dirs)]
   if (length(absent) > 0) {
     install.packages(absent)
   }
 })

 library(RXKCD)
 library(XML)
 library(tm)
 library(wordcloud)
 library(RColorBrewer)
 library(rJava)
 library(RWeka)
 library(Snowball)

 require(XML)
 require(RXKCD)
 require(tm)
 require(wordcloud)
 require(RColorBrewer)
 require(rJava)
 require(RWeka)
 require(Snowball)

 #install.packages("Snowball")#used for word stemming
 #install.packages("rJava")#required by RWeka

 ##Note: there are some handy, basic Twitter related functions here:
 ##https://github.com/matteoredaelli/twitter-r-utils
 #For example:
 #RemoveAtPeople <- function(tweet) {
 #  gsub("@\\w+", "", tweet)
 #}
 #Then for example, remove @'d names
 #tweets <- as.vector(sapply(tw.df$text, RemoveAtPeople))

 # Load the exported link posts. Empty cells and the literal strings "NA" and
 # "NULL" are read as missing values; text columns stay character vectors.
 # The argument is spelled out in full (stringsAsFactors) instead of relying
 # on partial argument matching.
 ap.df <- read.csv("sjteo links posted.csv", header = TRUE, sep = ",",
                   stringsAsFactors = FALSE,
                   na.strings = c("NA", "", "NULL"))

 # attach(ap.df) was dropped on purpose: every column below is reached through
 # ap.df$... or by position, and an attached copy does not reflect later
 # changes to ap.df (it would have to be detached and attached again).

 #check col name
 names(ap.df)
 #check particular col
 ap.df$url#disp all data in col url
 ap.df$url[766]#disp data[766] in col url

 # Keep only columns 3 to 6 of the raw data for the text mining below.
 ap.sub1 <- ap.df[, 3:6]
 #3 is owner comment,4,time created, 5 title,6 summary
 ap.sub1[, 1]#comment by me
 ap.sub1[, 3]#title by me
 ap.sub1[, 4]#summary text by ori source


 #ap.df<-read.csv("sjteo inbox1.csv",header = TRUE, stringsAsFactor=FALSE)
 # Build the corpus from ONE text column of ap.sub1, one document per row.
 # VectorSource() is used instead of DataframeSource(data.frame(...)) because
 # current tm releases only accept data frames that carry "doc_id" and "text"
 # columns in DataframeSource(); a bare one-column frame has neither.
 #ap.corpus <- Corpus(VectorSource(ap.sub1[, 3]))#title of post
 #ap.corpus <- Corpus(VectorSource(ap.sub1[, 1]))#comment by me
 ap.corpus <- Corpus(VectorSource(ap.sub1[, 4]))#summary by ori


 # Cleaning order matters: lower-case and strip punctuation first so entries
 # such as "dont" / "didnt" in the stop-word list can match, remove the stop
 # words next, and stem LAST. Stemming first rewrites words (plural and -ing
 # forms lose their endings), so the unstemmed stop words would stop matching.
 ap.corpus <- tm_map(ap.corpus, removeNumbers)
 #ap.corpus <- tm_map(ap.corpus, stripWhitespace)
 # tolower is a plain base function, so it is wrapped in content_transformer()
 # as tm expects for non-tm transformations.
 ap.corpus <- tm_map(ap.corpus, content_transformer(tolower))
 ap.corpus <- tm_map(ap.corpus, removePunctuation)


 #ap.corpus <- tm_map(ap.corpus, function(x) removeWords(x, stopwords("english")))
 # remove generic and custom stopwords (unique() drops repeated entries)
 my_stopwords <- unique(c(
   stopwords("english"),
   "quot", "quotquotquot", "null", "sj", "teo", "pls", "mrchua", "ill", "dun",
   "lol", "btw", "dont", "yeah",
   "thx", "name", "word", "nope", "teo", "dad", "nov", "earlier", "cockney",
   "time", "sir", "okok", "lady",
   "coz", "guys", "using", "girls", "ercan", "didnt", "etc", "lots", "hehehe",
   "stuff", "hows", "previous", "shld",
   "haha", "cedric", "anyway", "okie", "sorry", "timeline", "photos", "http",
   "www", "com", "chk", "amp"
 ))
 ap.corpus <- tm_map(ap.corpus, removeWords, my_stopwords)

 # Stem only after the stop words are gone (see the ordering note above).
 ap.corpus <- tm_map(ap.corpus, stemDocument)#preserve root words
 #cant stem coz rJava No CurrentVersion entry in key


 # Print the cleaned documents to the console (debug aid only).
 inspect(ap.corpus)

 # Count how often each term occurs in each document.
 ap.tdm <- TermDocumentMatrix(ap.corpus)
 #inspect(ap.tdm[1:100,1:10])#debug only
 #head(ap.tdm)#debug only
 #names(ap.tdm)#debug only

 # Colour palette used later when the word cloud is drawn.
 pal2 <- brewer.pal(n = 8, name = "Dark2")

 # Collapse the matrix to one total per term, most frequent term first, and
 # keep the result as a word / freq lookup table.
 ap.m <- as.matrix(ap.tdm)
 ap.v <- sort(rowSums(ap.m), decreasing = TRUE)
 ap.d <- data.frame(word = names(ap.v), freq = ap.v)

 # Distribution of the totals: how many terms share each frequency value.
 table(ap.d$freq)

 #find frequent terms (lower frequency bound of 5)
 findFreqTerms(ap.tdm, lowfreq = 5)

 #find correlations
 # NOTE(review): the corpus is stemmed earlier in the script, so the stored
 # term may be a stem rather than the literal word queried here -- confirm.
 findAssocs(ap.tdm, terms = "arduino", corlimit = 0.20)
 findAssocs(ap.tdm, terms = "makers", corlimit = 0.35)

 #clustering
 #d<-dist(ap.df, method="euclidean")#dissimilarity matrix
 #clusters<-hclust(d=d,method="ward")#ward's method to find clusters


 # Render the word cloud into a large PNG file. The canvas is deliberately
 # big: words that do not fit at the chosen scale cannot be placed.
 png("STEMM owner summary ori.png", width = 3280, height = 1800)
 #png("owner comment.png", width=2280,height=1800)
 #wordcloud(ap.d$word,ap.d$freq, scale=c(8,.2),min.freq=3,#word could not fit if canvas too small
 # tryCatch(finally = ) closes the PNG device even when wordcloud() fails, so
 # an error cannot leave the graphics device open for later plots.
 invisible(tryCatch(
   wordcloud(ap.d$word, ap.d$freq, scale = c(10, 0.9), min.freq = 4,
             max.words = Inf, random.order = FALSE, rot.per = .15,
             colors = pal2),
   finally = dev.off()
 ))
	# Install the Java / stemming back-ends only when they are not installed yet,
	# so re-running the script does not download and reinstall them every time.
	# NOTE(review): "Snowball" appears to have been archived on CRAN (SnowballC
	# looks like its successor) -- confirm it is still installable here.
	local({
	  wanted <- c("RWeka", "rJava", "Snowball")
	  # system.file() returns "" for a package that is not installed; unlike
	  # requireNamespace() it does not try to load the package (rJava needs a JVM).
	  install_dirs <- vapply(
	    wanted,
	    function(p) system.file(package = p),
	    character(1)
	  )
	  absent <- wanted[!nzchar(install_dirs)]
	  if (length(absent) > 0) {
	    install.packages(absent)
	  }
	})

	library(RXKCD)
	library(XML)
	library(tm)
	library(wordcloud)
	library(RColorBrewer)
	library(rJava)
	library(RWeka)
	library(Snowball)

	require(XML)
	require(RXKCD)
	require(tm)
	require(wordcloud)
	require(RColorBrewer)
	require(rJava)
	require(RWeka)
	require(Snowball)

	#install.packages("Snowball")#used for word stemming
	#install.packages("rJava")#required by RWeka

	##Note: there are some handy, basic Twitter related functions here:
	##https://github.com/matteoredaelli/twitter-r-utils
	#For example:
	#RemoveAtPeople <- function(tweet) {
	# gsub("@\\w+", "", tweet)
	#}
	#Then for example, remove @'d names
	#tweets <- as.vector(sapply(tw.df$text, RemoveAtPeople))

	# Load the exported link posts. Empty cells and the literal strings "NA" and
	# "NULL" are read as missing values; text columns stay character vectors.
	# The argument is spelled out in full (stringsAsFactors) instead of relying
	# on partial argument matching.
	ap.df <- read.csv("sjteo links posted.csv", header = TRUE, sep = ",",
	                  stringsAsFactors = FALSE,
	                  na.strings = c("NA", "", "NULL"))

	# attach(ap.df) was dropped on purpose: every column below is reached through
	# ap.df$... or by position, and an attached copy does not reflect later
	# changes to ap.df (it would have to be detached and attached again).

	#check col name
	names(ap.df)
	#check particular col
	ap.df$url#disp all data in col url
	ap.df$url[766]#disp data[766] in col url

	# Keep only columns 3 to 6 of the raw data for the text mining below.
	ap.sub1 <- ap.df[, 3:6]
	#3 is owner comment,4,time created, 5 title,6 summary
	ap.sub1[, 1]#comment by me
	ap.sub1[, 3]#title by me
	ap.sub1[, 4]#summary text by ori source


	#ap.df<-read.csv("sjteo inbox1.csv",header = TRUE, stringsAsFactor=FALSE)
	# Build the corpus from ONE text column of ap.sub1, one document per row.
	# VectorSource() is used instead of DataframeSource(data.frame(...)) because
	# current tm releases only accept data frames that carry "doc_id" and "text"
	# columns in DataframeSource(); a bare one-column frame has neither.
	#ap.corpus <- Corpus(VectorSource(ap.sub1[, 3]))#title of post
	#ap.corpus <- Corpus(VectorSource(ap.sub1[, 1]))#comment by me
	ap.corpus <- Corpus(VectorSource(ap.sub1[, 4]))#summary by ori


	# Cleaning order matters: lower-case and strip punctuation first so entries
	# such as "dont" / "didnt" in the stop-word list can match, remove the stop
	# words next, and stem LAST. Stemming first rewrites words (plural and -ing
	# forms lose their endings), so the unstemmed stop words would stop matching.
	ap.corpus <- tm_map(ap.corpus, removeNumbers)
	#ap.corpus <- tm_map(ap.corpus, stripWhitespace)
	# tolower is a plain base function, so it is wrapped in content_transformer()
	# as tm expects for non-tm transformations.
	ap.corpus <- tm_map(ap.corpus, content_transformer(tolower))
	ap.corpus <- tm_map(ap.corpus, removePunctuation)


	#ap.corpus <- tm_map(ap.corpus, function(x) removeWords(x, stopwords("english")))
	# remove generic and custom stopwords (unique() drops repeated entries)
	my_stopwords <- unique(c(
	  stopwords("english"),
	  "quot", "quotquotquot", "null", "sj", "teo", "pls", "mrchua", "ill", "dun",
	  "lol", "btw", "dont", "yeah",
	  "thx", "name", "word", "nope", "teo", "dad", "nov", "earlier", "cockney",
	  "time", "sir", "okok", "lady",
	  "coz", "guys", "using", "girls", "ercan", "didnt", "etc", "lots", "hehehe",
	  "stuff", "hows", "previous", "shld",
	  "haha", "cedric", "anyway", "okie", "sorry", "timeline", "photos", "http",
	  "www", "com", "chk", "amp"
	))
	ap.corpus <- tm_map(ap.corpus, removeWords, my_stopwords)

	# Stem only after the stop words are gone (see the ordering note above).
	ap.corpus <- tm_map(ap.corpus, stemDocument)#preserve root words
	#cant stem coz rJava No CurrentVersion entry in key


	# Print the cleaned documents to the console (debug aid only).
	inspect(ap.corpus)

	# Count how often each term occurs in each document.
	ap.tdm <- TermDocumentMatrix(ap.corpus)
	#inspect(ap.tdm[1:100,1:10])#debug only
	#head(ap.tdm)#debug only
	#names(ap.tdm)#debug only

	# Colour palette used later when the word cloud is drawn.
	pal2 <- brewer.pal(n = 8, name = "Dark2")

	# Collapse the matrix to one total per term, most frequent term first, and
	# keep the result as a word / freq lookup table.
	ap.m <- as.matrix(ap.tdm)
	ap.v <- sort(rowSums(ap.m), decreasing = TRUE)
	ap.d <- data.frame(word = names(ap.v), freq = ap.v)

	# Distribution of the totals: how many terms share each frequency value.
	table(ap.d$freq)

	#find frequent terms (lower frequency bound of 5)
	findFreqTerms(ap.tdm, lowfreq = 5)

	#find correlations
	# NOTE(review): the corpus is stemmed earlier in the script, so the stored
	# term may be a stem rather than the literal word queried here -- confirm.
	findAssocs(ap.tdm, terms = "arduino", corlimit = 0.20)
	findAssocs(ap.tdm, terms = "makers", corlimit = 0.35)

	#clustering
	#d<-dist(ap.df, method="euclidean")#dissimilarity matrix
	#clusters<-hclust(d=d,method="ward")#ward's method to find clusters


	# Render the word cloud into a large PNG file. The canvas is deliberately
	# big: words that do not fit at the chosen scale cannot be placed.
	png("STEMM owner summary ori.png", width = 3280, height = 1800)
	#png("owner comment.png", width=2280,height=1800)
	#wordcloud(ap.d$word,ap.d$freq, scale=c(8,.2),min.freq=3,#word could not fit if canvas too small
	# tryCatch(finally = ) closes the PNG device even when wordcloud() fails, so
	# an error cannot leave the graphics device open for later plots.
	invisible(tryCatch(
	  wordcloud(ap.d$word, ap.d$freq, scale = c(10, 0.9), min.freq = 4,
	            max.words = Inf, random.order = FALSE, rot.per = .15,
	            colors = pal2),
	  finally = dev.off()
	))
No results found