# Access the Twitter API from R and make a word cloud
setwd("~/R/RStudio/twitterwordcloud")
#install the necessary packages
#install.packages("ROAuth")
#install.packages("twitteR")
#install.packages("wordcloud")
#install.packages("tm")
library("ROAuth")
library("twitteR")
library("wordcloud")
library("tm")
#necessary step for Windows
download.file(url="http://curl.haxx.se/ca/cacert.pem", destfile="cacert.pem")
#to get your consumerKey and consumerSecret see the twitteR documentation for instructions
cred <- OAuthFactory$new(consumerKey='GET FROM TWITTER',
                         consumerSecret='GET FROM TWITTER',
                         requestURL='https://api.twitter.com/oauth/request_token',
                         accessURL='https://api.twitter.com/oauth/access_token',
                         authURL='https://api.twitter.com/oauth/authorize')
#run the handshake once interactively to authorise the app
#(the cainfo argument is only needed on Windows)
#cred$handshake(cainfo="cacert.pem")
#save the credential so the handshake does not have to be repeated
save(cred, file="twitter authentication.Rdata")
registerTwitterOAuth(cred)
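# In a later session the saved credential can be reloaded instead of repeating the
# handshake; a minimal sketch, assuming "twitter authentication.Rdata" is in the
# working directory:
# load("twitter authentication.Rdata")
# registerTwitterOAuth(cred)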
#the cainfo parameter is necessary on Windows
r_stats <- searchTwitter("#Whisky", n=1500, cainfo="cacert.pem")
#save text
r_stats_text <- sapply(r_stats, function(x) x$getText())
#create corpus
r_stats_text_corpus <- Corpus(VectorSource(r_stats_text))
#clean up
r_stats_text_corpus <- tm_map(r_stats_text_corpus, tolower)
r_stats_text_corpus <- tm_map(r_stats_text_corpus, removePunctuation)
r_stats_text_corpus <- tm_map(r_stats_text_corpus, function(x) removeWords(x, stopwords()))
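# Note: with tm >= 0.6 the base-R function tolower (and the anonymous removeWords
# wrapper above) needs to be wrapped in content_transformer(), otherwise later steps
# may fail on plain character documents. A hedged sketch for newer tm versions:
# r_stats_text_corpus <- tm_map(r_stats_text_corpus, content_transformer(tolower))
# r_stats_text_corpus <- tm_map(r_stats_text_corpus, removePunctuation)
# r_stats_text_corpus <- tm_map(r_stats_text_corpus, removeWords, stopwords())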
library(RColorBrewer)
pal2 <- brewer.pal(8, "Dark2")
wordcloud(r_stats_text_corpus, min.freq=20, max.words=100, random.order=TRUE, colors=pal2)
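# To keep a copy of the cloud, the plot can be written to a file with a base R
# graphics device; a minimal sketch (the filename is just an example):
# png("whisky_wordcloud.png", width = 800, height = 800)
# wordcloud(r_stats_text_corpus, min.freq=20, max.words=100, random.order=TRUE, colors=pal2)
# dev.off()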
#NEW ANALYSIS
#Retrieve text
collTweets <- userTimeline("JesusCaff", n=1500, cainfo="cacert.pem")
n <- length(collTweets)
collTweets[1:3]
#Transforming text
df <- do.call("rbind", lapply(collTweets, as.data.frame))
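# Equivalent shortcut: the twitteR package provides twListToDF() for exactly this
# list-of-statuses to data.frame conversion:
# df <- twListToDF(collTweets)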
dim(df)
library(tm)
# build a corpus, which is a collection of text documents
# VectorSource specifies that the source is character vectors.
myCorpus <- Corpus(VectorSource(df$text))
#The corpus then needs a couple of transformations: changing letters to
#lower case, removing punctuation/numbers/links, and removing stop words.
#The general English stop-word list is extended with a few extra words
#(e.g. "available", "via", "amp") that add no meaning here.
myCorpus <- tm_map(myCorpus, tolower)
# remove punctuation
myCorpus <- tm_map(myCorpus, removePunctuation)
# remove numbers
myCorpus <- tm_map(myCorpus, removeNumbers)
# remove links
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
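# A quick check of removeURL: by this point tolower, removePunctuation and removeNumbers
# have already collapsed links such as "http://t.co/AbC123" into tokens like "httptcoabc",
# which "http[[:alnum:]]*" then strips in full, e.g.
# removeURL("a nice dram httptcoabc")   # -> "a nice dram "
# If links should be removed before punctuation is stripped, a broader (assumed) pattern
# such as gsub("http[^[:space:]]*", "", x) could be used instead.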
# remove stopwords
myStopwords <- c(stopwords('english'), "part", "new", "dont", "will", "close", "along", "held", "get", "current", "available", "via", "the", "see", "now", "amp", "yet")
myCorpus <- tm_map(myCorpus, removeURL)
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
#In many cases, words need to be stemmed to retrieve their radicals.
#For instance, "example" and "examples" are both stemmed to "exampl".
#However, after that, one may want to complete the stems to their
#original forms, so that the words look "normal".
dictCorpus <- myCorpus
# stem words in a text document with the snowball stemmers,
# which requires packages Snowball, RWeka, rJava, RWekajars
# commented out as it was behaving oddly...
#myCorpus <- tm_map(myCorpus, stemDocument)
# inspect the first three documents
inspect(myCorpus[1:3])
# stem completion
#myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus)
inspect(myCorpus[1:3])
myDtm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 1))
# peek at a corner of the matrix (the indices depend on how many terms and tweets came back)
inspect(myDtm[266:270, 31:40])
#Frequent terms and associations
findFreqTerms(myDtm, lowfreq=10)
# which words are associated with "fellow"?
findAssocs(myDtm, 'fellow', 0.30)
m <- as.matrix(myDtm)
# calculate the frequency of words
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
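# Optional sanity check before plotting: look at the most frequent terms
# (a minimal sketch using base R only):
# head(d, 10)
# barplot(d$freq[1:10], names.arg = as.character(d$word[1:10]), las = 2)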
wordcloud(d$word, d$freq, min.freq=3, max.words=100, random.order=TRUE, colors=pal2)
#NEW ANALYSIS
#Retrieve text
collTweets <- searchTwitter("#Whisky", n=1500, cainfo="cacert.pem")
n <- length(collTweets)
collTweets[1:3]
#Transforming text
df <- do.call("rbind", lapply(collTweets, as.data.frame))
dim(df)
library(tm)
# build a corpus, which is a collection of text documents
# VectorSource specifies that the source is character vectors.
myCorpus <- Corpus(VectorSource(df$text))
#The corpus then needs a couple of transformations: changing letters to
#lower case, removing punctuation/numbers/links, and removing stop words.
#The general English stop-word list is extended with a few extra words
#(e.g. "whisky", "whiskey", "available", "via") that add no meaning here.
myCorpus <- tm_map(myCorpus, tolower)
# remove punctuation
myCorpus <- tm_map(myCorpus, removePunctuation)
# remove numbers
myCorpus <- tm_map(myCorpus, removeNumbers)
# remove links
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
# remove stopwords
myStopwords <- c(stopwords('english'), "part", "whisky", "whiskey", "new", "dont", "will", "close", "along", "held", "get", "current", "available", "via", "the", "see", "now", "amp", "yet")
myCorpus <- tm_map(myCorpus, removeURL)
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
#In many cases, words need to be stemmed to retrieve their radicals.
#For instance, "example" and "examples" are both stemmed to "exampl".
#However, after that, one may want to complete the stems to their
#original forms, so that the words look "normal".
dictCorpus <- myCorpus
# stem words in a text document with the snowball stemmers,
# which requires packages Snowball, RWeka, rJava, RWekajars
# commented out as it was behaving oddly...
#myCorpus <- tm_map(myCorpus, stemDocument)
# inspect the first three documents
inspect(myCorpus[1:3])
# stem completion
#myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus)
inspect(myCorpus[1:3])
myDtm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 1))
inspect(myDtm[266:270, 31:40])
#Frequent terms and associations
findFreqTerms(myDtm, lowfreq=10)
m <- as.matrix(myDtm)
# calculate the frequency of words
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=18, max.words=100, random.order=TRUE, colors=pal2)
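# If many words trigger "could not be fit on page" warnings, the scale argument of
# wordcloud() can be lowered; a hedged variant of the call above:
# wordcloud(d$word, d$freq, min.freq=18, max.words=100, random.order=TRUE,
#           scale=c(3, 0.5), colors=pal2)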
#NEW ANALYSIS
#Retrieve text
collTweets <- searchTwitter("#Whiskey", n=1500, cainfo="cacert.pem")
n <- length(collTweets)
collTweets[1:3]
#Transforming text
df <- do.call("rbind", lapply(collTweets, as.data.frame))
dim(df)
library(tm)
# build a corpus, which is a collection of text documents
# VectorSource specifies that the source is character vectors.
myCorpus <- Corpus(VectorSource(df$text))
#The corpus then needs a couple of transformations: changing letters to
#lower case, removing punctuation/numbers/links, and removing stop words.
#The general English stop-word list is extended with a few extra words
#(e.g. "whisky", "whiskey", "available", "via") that add no meaning here.
myCorpus <- tm_map(myCorpus, tolower)
# remove punctuation
myCorpus <- tm_map(myCorpus, removePunctuation)
# remove numbers
myCorpus <- tm_map(myCorpus, removeNumbers)
# remove links
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
# remove stopwords
myStopwords <- c(stopwords('english'), "part", "whisky", "whiskey", "new", "dont", "will", "close", "along", "held", "get", "current", "available", "via", "the", "see", "now", "amp", "yet")
myCorpus <- tm_map(myCorpus, removeURL)
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
#In many cases, words need to be stemmed to retrieve their radicals.
#For instance, "example" and "examples" are both stemmed to "exampl".
#However, after that, one may want to complete the stems to their
#original forms, so that the words look "normal".
dictCorpus <- myCorpus
# stem words in a text document with the snowball stemmers,
# which requires packages Snowball, RWeka, rJava, RWekajars
# commented out as it was behaving oddly...
#myCorpus <- tm_map(myCorpus, stemDocument)
# inspect the first three documents
inspect(myCorpus[1:3])
# stem completion
#myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus)
inspect(myCorpus[1:3])
myDtm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 1))
inspect(myDtm[266:270, 31:40])
#Frequent terms and associations
findFreqTerms(myDtm, lowfreq=10)
m <- as.matrix(myDtm)
# calculate the frequency of words
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=10, max.words=100, random.order=TRUE, colors=pal2)