2015 New Year Resolution - Capstone Project
#####################################################################
# STEP 1: INSTALL REQUIRED PACKAGES AND CONNECT TO ALL LIBRARIES
#####################################################################
library("NLP")
library("tm")
library("RCurl")
library("slam")
library(sentiment)
library(gridExtra)
library(devtools)
library(plyr)
library(dplyr)
library(stringr)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(RColorBrewer)
library(bitops)
library(RWeka)
library(klaR)
library(caret)
library(Rstem)
library(SnowballC)
library(twitteR)
library(e1071)
require(rJava)
#require(Rwekajars)
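## Note: sentiment and Rstem are no longer on CRAN; a common workaround
## (an assumption -- verify the archive URLs before relying on them) is to
## install the archived sources once, before the library() calls above:
# install.packages("https://cran.r-project.org/src/contrib/Archive/Rstem/Rstem_0.4-1.tar.gz",
#                  repos = NULL, type = "source")
# install.packages("https://cran.r-project.org/src/contrib/Archive/sentiment/sentiment_0.2.tar.gz",
#                  repos = NULL, type = "source")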
#####################################################################
# STEP 2: IMPORT CSV FILE TO R ENVIRONMENT
#####################################################################
## STEP 2.1: Get current working directory and set working directory
## (avoid entering the full path of the data files)
getwd()
## The tweets are assumed to be loaded into `new_res` beforehand, e.g.
## (file name is illustrative):
## new_res <- read.csv("new_res.csv", stringsAsFactors = FALSE)
Res <- new_res
## STEP 2.2: Check data class, summary and others
head(Res, 3)
str(Res)
summary(Res)
names(Res)
ncol(Res)
#####################################################################
# STEP 3: PREPARE THE TEXT FOR SENTIMENT ANALYSIS
#####################################################################
## STEP 3.1: Create clean function to preprocess the content
clean.text <- function(some_txt)
{
  ### STEP 3.1(a): Remove retweet markers, @mentions, punctuation, digits, links
  some_txt = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt)
  some_txt = gsub("@\\w+", "", some_txt)
  some_txt = gsub("[[:punct:]]", "", some_txt)
  some_txt = gsub("[[:digit:]]", "", some_txt)
  some_txt = gsub("http\\w+", "", some_txt)
  some_txt = gsub("[ \t]{2,}", " ", some_txt)  # collapse whitespace runs to one space (replacing with "" would glue words together)
  some_txt = gsub("^\\s+|\\s+$", "", some_txt)
  ### STEP 3.1(b): Remove non-English characters
  some_txt = gsub("[^\x20-\x7E]", "", some_txt)
  ### STEP 3.1(c): Define "tolower error handling" function
  try.tolower = function(x)
  {
    y = NA
    try_error = tryCatch(tolower(x), error = function(e) e)
    if (!inherits(try_error, "error"))
      y = tolower(x)
    return(y)
  }
  some_txt = sapply(some_txt, try.tolower)
  some_txt = some_txt[some_txt != ""]
  names(some_txt) = NULL
  return(some_txt)
}
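## A quick sanity check of clean.text() on a made-up tweet (the example
## text is illustrative only; the cleaning steps above produce:)
# clean.text("RT @user My 2015 resolution: run more!! http://t.co/abc")
# #> "my resolution run more"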
### STEP 3.1(d): Clean the content with the "clean.text" function & remove quotes
#   (assumes no tweet is emptied by cleaning; clean.text drops empty strings,
#   so the column assignment would fail on a length mismatch otherwise)
Res$text_new <- clean.text(Res$text)
class(Res)
str(Res)
##### ----------------------------------------------------------- #####
str(Res$text_new)                      # check the class
head(Res$text_new, 3)                  # first 3 texts
Res$text_new <- noquote(Res$text_new)  # omit quotes from text
### STEP 3.2: Convert other columns & combine new columns
Res$tweet_date_new <- as.Date(as.POSIXlt(Res$tweet_date))
Res$tweet_created_new <- as.Date(as.POSIXlt(Res$tweet_created))
Res$userid_new <- as.numeric(Res$tweet_id)
Res$retweet_new <- as.numeric(Res$retweet_count)
Res$other_topic_new <- as.character(Res$other_topic)
Res$resolution_topics_new <- as.character(Res$resolution_topics)
Res$gender_new <- as.character(Res$gender)
Res$name_new <- as.character(Res$name)
Res$resolution_Category_new <- as.character(Res$Resolution_Category)
Res$text_new <- as.character(Res$text_new)
Res$location_new <- as.character(Res$tweet_location)
Res$state_new <- as.character(Res$tweet_state)
Res$user_timezone_new <- as.character(Res$user_timezone)
Res$tweet_region_new <- as.character(Res$tweet_region)
head(Res)
### STEP 3.3: Convert to data frame, remove duplicates & check the new table
Res2 <- as.data.frame(Res)
Res2 <- unique(Res2)  # keep unique rows (the result must be assigned to take effect)
head(Res2, 3)
str(Res2)
dim(Res2)
summary(Res2)
#####################################################################
# STEP 4: PERFORM SENTIMENT ANALYSIS OF TWEETS
# (BY EMOTIONAL & POLARITY CATEGORIES: Method - Learning Based)
#####################################################################
## STEP 4.1: Classify emotion
Res2_emo = classify_emotion(Res2$text_new, algorithm = "bayes", prior = 1.0)
### STEP 4.1.1: Get emotion best fit (column 7, BEST_FIT, of the returned matrix)
emotion = Res2_emo[, 7]
### STEP 4.1.2: Substitute NAs by "unknown"
emotion[is.na(emotion)] = "unknown"
## STEP 4.2: Classify polarity
Res2_pol = classify_polarity(Res2$text_new, algorithm = "bayes")
### STEP 4.2.1: Get polarity best fit (column 4, BEST_FIT, of the returned matrix)
polarity = Res2_pol[, 4]
## STEP 4.3: Create data frame to obtain some general statistics
### STEP 4.3.1: Data frame with results
Res2_sentiment = data.frame(text = Res2$text_new, emotion = emotion,
                            polarity = polarity, stringsAsFactors = FALSE)
### STEP 4.3.2: Reorder the emotion factor levels by decreasing frequency
Res2_sentiment = within(Res2_sentiment,
                        emotion <- factor(emotion, levels = names(sort(table(emotion),
                                                                       decreasing = TRUE))))
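## The "general statistics" mentioned in STEP 4.3; a minimal sketch:
table(Res2_sentiment$emotion)                         # tweet counts per emotion
round(prop.table(table(Res2_sentiment$polarity)), 3)  # share of each polarity class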
### STEP 4.4: Perform data visualization
#### a) Plot distribution of emotions
ggplot(Res2_sentiment, aes(x = emotion)) + geom_bar(aes(y = ..count.., fill = emotion)) +
  xlab("Emotion Categories") + ylab("Number of Tweets") +
  ggtitle("Sentiment Analysis of Tweets on Resolutions (Emotions)")
#### b) Plot distribution of polarity
ggplot(Res2_sentiment, aes(x = polarity)) + geom_bar(aes(y = ..count.., fill = polarity)) +
  xlab("Polarity Categories") + ylab("Number of Tweets") +
  ggtitle("Sentiment Analysis of Tweets on Resolutions (Polarity)")
#####################################################################
# STEP 5: PERFORM COMPARISON CLOUD TO VISUALIZE THE WORDS
#         & FIND THE MOST FREQUENTLY OCCURRING WORDS
#####################################################################
## STEP 5.1: Separate text by emotion (see the vectorized alternative below)
emos <- levels(factor(Res2_sentiment$emotion))
nemo <- length(emos)
emo.docs <- rep("", nemo)
for (i in 1:nemo)
{
  tmp <- Res2_sentiment$text[Res2_sentiment$emotion == emos[i]]
  emo.docs[i] <- paste(tmp, collapse = " ")
}
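## Equivalent one-liner, if you prefer to avoid the explicit loop
## (same result, just vectorized with sapply):
# emo.docs <- sapply(emos, function(e)
#   paste(Res2_sentiment$text[Res2_sentiment$emotion == e], collapse = " "))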
stopwords("english")  ##--- check stopwords
emo.docs <- removeWords(emo.docs, stopwords("english"))
class(emo.docs)
corpus <- Corpus(VectorSource(emo.docs))
mycorpus <- corpus  # keep an unstemmed copy as the stem-completion dictionary
corpus <- tm_map(corpus, stemDocument, language = "english")
corpus <- tm_map(corpus, stemCompletion, dictionary = mycorpus)
inspect(corpus)
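## Caveat: in tm >= 0.6, stemCompletion() no longer works directly inside
## tm_map() on a corpus; a common workaround (a sketch, assuming space-
## separated tokens) wraps it per document with content_transformer():
# stemCompletion2 <- function(x, dictionary) {
#   x <- unlist(strsplit(as.character(x), " "))
#   x <- x[x != ""]
#   paste(stemCompletion(x, dictionary = dictionary), collapse = " ")
# }
# corpus <- tm_map(corpus, content_transformer(stemCompletion2), dictionary = mycorpus)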
## STEP 5.2: Create Term-Document Matrix
tdm <- TermDocumentMatrix(corpus)
colnames(tdm) <- emos
class(tdm)
## STEP 5.3: Convert tdm into a matrix
tdm <- as.matrix(tdm)
class(tdm)
## STEP 5.4: Save as CSV file
write.csv(tdm, 'Res2_words_stem.csv')
## STEP 5.5: Find the most frequent terms
max(apply(tdm, 1, sum))
which(apply(tdm, 1, sum) == max(apply(tdm, 1, sum)))  # compute the maximum instead of hard-coding the observed value
which(apply(tdm, 1, sum) > 300)
## STEP 5.6: Get word counts of the top 20 terms, in decreasing order
word_freqs <- sort(rowSums(tdm), decreasing = TRUE)[1:20]
## STEP 5.7: Create a data frame with words and their frequencies
dm <- data.frame(word = names(word_freqs), freq = word_freqs)
elec_cand <- ggplot(subset(dm, freq > 20), aes(word, freq))
elec_cand <- elec_cand + geom_bar(stat = "identity")  # bar heights taken directly from freq
elec_cand <- elec_cand + theme(axis.text.x = element_text(angle = 45, hjust = 1))
elec_cand  # display the bar chart
### STEP 5.7(a): Visualize the most frequent words in the text
wordcloud(dm$word, dm$freq, random.order = FALSE,
          colors = brewer.pal(6, "Dark2"), min.freq = 10,
          scale = c(4, .2), rot.per = .15)
### STEP 5.7(b): Visualize the most frequent words by emotion category
comparison.cloud(tdm, colors = brewer.pal(nemo, "Dark2"),
                 scale = c(3, .5), rot.per = .15,
                 random.order = FALSE, title.size = 1.5)
#####################################################################
# STEP 6: TEXT MINING
#####################################################################
## STEP 6.1: Transforming text
### STEP 6.1.1: Build a corpus, which is a collection of text documents
# VectorSource specifies that the source is a character vector
df <- data.frame(V1 = Res2$text_new, stringsAsFactors = FALSE)
mycorpus <- Corpus(VectorSource(Res2$text_new))
### STEP 6.1.2: Preprocess the text
clean.corpus <- function(mycorpus)
{
  # i)   change letters to lower case
  # ii)  remove punctuation and numbers
  # iii) remove stop words and extra whitespace
  mycorpus = tm_map(mycorpus, removePunctuation)
  mycorpus = tm_map(mycorpus, removeNumbers)  # the number removal promised above
  mycorpus = tm_map(mycorpus, stripWhitespace)
  mycorpus = tm_map(mycorpus, content_transformer(tolower))
  # The general English stop-word list is tailored by adding
  # "available" and "via" and removing "r"
  myStopwords <- c(stopwords('english'), "available", "via")
  idx <- which(myStopwords == "r")
  if (length(idx) > 0)  # guard: subsetting with -integer(0) would drop the whole list
    myStopwords <- myStopwords[-idx]
  mycorpus <- tm_map(mycorpus, removeWords, myStopwords)
  return(mycorpus)
}
Res2.corpus = clean.corpus(mycorpus)
## STEP 6.2: Stem words to retrieve the root form,
#  so that variants of a word are counted together
### STEP 6.2.2: Create a copy of the corpus to use later as a dictionary
#  for stem completion
dictcorpus <- Res2.corpus
### STEP 6.2.3: Perform stemDocument & inspect the first 3 documents
Res2.corpus <- tm_map(Res2.corpus, stemDocument)
inspect(Res2.corpus[1:3])  # display detailed information on a corpus or tdm
### STEP 6.2.4: Stem completion & print the first 3 documents in the built corpus
Res2.corpus <- tm_map(Res2.corpus, stemCompletion, dictionary = dictcorpus)
inspect(Res2.corpus[1:3])
### STEP 6.2.5 / STEP 6.3: Build the term-document matrix from the cleaned,
#  completed corpus & find lists of words (minWordLength = 1 keeps one-letter terms)
myDtm <- TermDocumentMatrix(Res2.corpus, control = list(minWordLength = 1))
inspect(myDtm[560:570, 100:110])
findFreqTerms(myDtm, lowfreq = 40)
findAssocs(myDtm, 'dirltr', 0.70)
#####################################################################
# STEP 7: TEXT MINING - CLASSIFICATION USING NAIVE BAYES
#         ON THE CLEANED TEXTS, WITHOUT TOKENIZING
#####################################################################
RRR <- Res2_sentiment
rownames(RRR) <- NULL
RRR
# 7.1. Change column formats to factor
RRR <- data.frame(RRR, category = Res2$resolution_Category_new)  # add category (i.e. resolution category) column to the texts
RRR <- data.frame(RRR, gender = Res2$gender_new)                 # add gender column alongside the text column
RRR$gender <- as.factor(RRR$gender)                              # change column into factor format
RRR$text <- as.factor(RRR$text)
RRR$emotion <- as.factor(RRR$emotion)
RRR$polarity <- as.factor(RRR$polarity)
RRR$category <- as.factor(RRR$category)
head(RRR)
str(RRR)
nrow(RRR)
# 7.2. Text classification using the polarity target
set.seed(123)  # for a reproducible train/test split
rn_train <- sample(nrow(RRR), floor(nrow(RRR) * 0.7))
train <- RRR[rn_train, ]
test <- RRR[-rn_train, ]
model <- NaiveBayes(polarity ~ ., data = train)
predictions <- predict(model, test)                # make predictions
confusionMatrix(predictions$class, test$polarity)  # compare predictions against the actual labels
# 7.3. Text classification using the category (i.e. different resolutions) target
rn_train <- sample(nrow(RRR), floor(nrow(RRR) * 0.7))
train <- RRR[rn_train, ]
test <- RRR[-rn_train, ]
model <- NaiveBayes(category ~ ., data = train)
predictions <- predict(model, test)                # make predictions
confusionMatrix(predictions$class, test$category)  # compare predictions against the actual labels
# 8. Text classification by gender
# 8.1. Divide the texts into two subsets, one for males and one for females,
#      and run Naive Bayes on each subset
maleR <- subset(RRR, gender == "male", select = c(text, emotion, polarity, category, gender))
femaleR <- subset(RRR, gender == "female", select = c(text, emotion, polarity, category, gender))
nrow(maleR)
nrow(femaleR)
# 8.2. Run Naive Bayes on the female subset (ladies first!)
rn_train <- sample(nrow(femaleR), floor(nrow(femaleR) * 0.7))
train <- femaleR[rn_train, ]
test <- femaleR[-rn_train, ]
model <- NaiveBayes(category ~ ., data = train)
predictions <- predict(model, test)                # make predictions
confusionMatrix(predictions$class, test$category)  # compare predictions against the actual labels
# 8.3. Run Naive Bayes on the male subset
rn_train <- sample(nrow(maleR), floor(nrow(maleR) * 0.7))
train <- maleR[rn_train, ]
test <- maleR[-rn_train, ]
model <- NaiveBayes(category ~ ., data = train)
predictions <- predict(model, test)                # make predictions
confusionMatrix(predictions$class, test$category)  # compare predictions against the actual labels
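## A hedged sketch of the tokenized alternative to STEP 7: reuse the STEP 6
## term-document matrix as Naive Bayes predictors via e1071::naiveBayes
## (assumes the columns of myDtm line up row-for-row with RRR; the 0.99
## sparsity cutoff and object names below are illustrative, not from the original).
# dtm_feats <- as.data.frame(t(as.matrix(removeSparseTerms(myDtm, 0.99))))
# dtm_feats[] <- lapply(dtm_feats, function(v) factor(v > 0))  # presence/absence features
# set.seed(123)
# idx <- sample(nrow(dtm_feats), floor(nrow(dtm_feats) * 0.7))
# nb_tok <- naiveBayes(x = dtm_feats[idx, ], y = RRR$category[idx])
# pred_tok <- predict(nb_tok, dtm_feats[-idx, ])
# confusionMatrix(pred_tok, RRR$category[-idx])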