# Access the Twitter API from R and make a word cloud
setwd("~/R/RStudio/twitterwordcloud")
#install the necessary packages (uncomment on first run)
#install.packages("ROAuth")
#install.packages("twitteR")
#install.packages("wordcloud")
#install.packages("tm")
library("ROAuth")
library("twitteR")
library("wordcloud")
library("tm")
#necessary step for Windows
download.file(url="http://curl.haxx.se/ca/cacert.pem", destfile="cacert.pem")
#to get your consumerKey and consumerSecret see the twitteR documentation for instructions
cred <- OAuthFactory$new(consumerKey='GET FROM TWITTER',
                         consumerSecret='GET FROM TWITTER',
                         requestURL='https://api.twitter.com/oauth/request_token',
                         accessURL='https://api.twitter.com/oauth/access_token',
                         authURL='https://api.twitter.com/oauth/authorize')
#perform the OAuth handshake; the cainfo argument is only needed on Windows
cred$handshake(cainfo="cacert.pem")
#save the credentials so later sessions can skip the handshake
save(cred, file="twitter authentication.Rdata")
registerTwitterOAuth(cred)
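#in a later session the handshake can be skipped entirely by reloading the
#saved credentials (a minimal sketch; assumes the .Rdata file above exists):
#load("twitter authentication.Rdata")
#registerTwitterOAuth(cred)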
#the cainfo parameter is necessary on Windows
r_stats<- searchTwitter("#Whisky", n=1500, cainfo="cacert.pem")
#extract the tweet text from each status object
r_stats_text <- sapply(r_stats, function(x) x$getText())
#create corpus
r_stats_text_corpus <- Corpus(VectorSource(r_stats_text))
#clean up
#note: tm >= 0.6 requires base functions like tolower to be wrapped
#in content_transformer()
r_stats_text_corpus <- tm_map(r_stats_text_corpus, content_transformer(tolower))
r_stats_text_corpus <- tm_map(r_stats_text_corpus, removePunctuation)
r_stats_text_corpus <- tm_map(r_stats_text_corpus, removeWords, stopwords("english"))
library(RColorBrewer)
pal2 <- brewer.pal(8,"Dark2")
wordcloud(r_stats_text_corpus,min.freq=20,max.words=100, random.order=T, colors=pal2)
#NEW ANALYSIS: word cloud of a single user's timeline
#Retrieve text
collTweets <- userTimeline("JesusCaff", n=1500 , cainfo="cacert.pem")
n <- length(collTweets)
collTweets[1:3]
#Transforming Text
df <- do.call("rbind", lapply(collTweets, as.data.frame))
dim(df)
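#note: twitteR also ships twListToDF(), which performs the same
#list-to-data-frame conversion in one call:
#df <- twListToDF(collTweets)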
library(tm)
# build a corpus, which is a collection of text documents
# VectorSource specifies that the source is character vectors.
myCorpus <- Corpus(VectorSource(df$text))
#After that, the corpus needs a couple of transformations:
#changing letters to lower case, removing punctuation, numbers and links,
#and removing stop words. The stock English stop-word list is extended
#below with corpus-specific words such as "available" and "via".
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
# remove punctuation
myCorpus <- tm_map(myCorpus, removePunctuation)
# remove numbers
myCorpus <- tm_map(myCorpus, removeNumbers)
#remove links (the URL text survives the punctuation removal above as
#strings like "httptcoabc", which this regex catches)
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
#remove stopwords, extending the stock English list with corpus-specific words
myStopwords <- c(stopwords('english'),
                 "part", "new", "dont", "will", "close", "along", "held", "get",
                 "current", "available", "via", "the", "see", "now", "amp", "yet")
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
#In many cases, words need to be stemmed to retrieve their radicals.
#For instance, "example" and "examples" are both stemmed to "exampl".
#However, after that, one may want to complete the stems to their
#original forms, so that the words would look "normal".
dictCorpus <- myCorpus
# stem words with the Snowball stemmer (older tm needed the Snowball,
# RWeka, rJava and RWekajars packages; tm >= 0.6 uses SnowballC);
# commented out because the results looked odd on this corpus
#myCorpus <- tm_map(myCorpus, stemDocument)
# inspect the first three documents
inspect(myCorpus[1:3])
# stem completion
#myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus)
inspect(myCorpus[1:3])
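#a sketch of a stemming-plus-completion pipeline for tm >= 0.6 with the
#SnowballC package installed; stemCompletion() expects a character vector
#of stems, so each document is split, completed against dictCorpus and
#re-joined (left commented, like the calls above):
#myCorpus <- tm_map(myCorpus, stemDocument)
#completeStems <- content_transformer(function(x, dict)
#  paste(stemCompletion(strsplit(x, " ")[[1]], dictionary = dict),
#        collapse = " "))
#myCorpus <- tm_map(myCorpus, completeStems, dict = dictCorpus)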
#minWordLength was replaced by wordLengths in tm >= 0.6
myDtm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))
# peek at a small slice of the term-document matrix
inspect(myDtm[266:270,31:40])
#Frequent terms and associations
findFreqTerms(myDtm, lowfreq=10)
# which words are associated with "fellow"?
findAssocs(myDtm, 'fellow', 0.30)
m <- as.matrix(myDtm)
# calculate the frequency of words
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq,min.freq=3,max.words=100, random.order=T, colors=pal2)
#NEW ANALYSIS: word cloud for the #Whisky hashtag
#Retrieve text
collTweets <- searchTwitter("#Whisky", n=1500 , cainfo="cacert.pem")
n <- length(collTweets)
collTweets[1:3]
#Transforming Text
df <- do.call("rbind", lapply(collTweets, as.data.frame))
dim(df)
library(tm)
# build a corpus, which is a collection of text documents
# VectorSource specifies that the source is character vectors.
myCorpus <- Corpus(VectorSource(df$text))
#apply the same transformations as in the first analysis
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
# remove punctuation
myCorpus <- tm_map(myCorpus, removePunctuation)
# remove numbers
myCorpus <- tm_map(myCorpus, removeNumbers)
#remove links, reusing removeURL() from above
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
#remove stopwords; the search terms themselves are added to the list
myStopwords <- c(stopwords('english'),
                 "part", "whisky", "whiskey", "new", "dont", "will", "close",
                 "along", "held", "get", "current", "available", "via", "the",
                 "see", "now", "amp", "yet")
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
#stemming and stem completion could be applied here exactly as sketched in
#the first analysis; they are left out for the same reason
dictCorpus <- myCorpus
# inspect the first three documents
inspect(myCorpus[1:3])
myDtm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))
inspect(myDtm[266:270,31:40])
#Frequent terms and associations
findFreqTerms(myDtm, lowfreq=10)
m <- as.matrix(myDtm)
# calculate the frequency of words
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq,min.freq=18,max.words=100, random.order=T, colors=pal2)
#NEW ANALYSIS: word cloud for the #Whiskey hashtag
#Retrieve text
collTweets <- searchTwitter("#Whiskey", n=1500 , cainfo="cacert.pem")
n <- length(collTweets)
collTweets[1:3]
#Transforming Text
df <- do.call("rbind", lapply(collTweets, as.data.frame))
dim(df)
library(tm)
# build a corpus, which is a collection of text documents
# VectorSource specifies that the source is character vectors.
myCorpus <- Corpus(VectorSource(df$text))
#apply the same transformations as in the first analysis
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
# remove punctuation
myCorpus <- tm_map(myCorpus, removePunctuation)
# remove numbers
myCorpus <- tm_map(myCorpus, removeNumbers)
#remove links, reusing removeURL() from above
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
#remove stopwords; the search terms themselves are added to the list
myStopwords <- c(stopwords('english'),
                 "part", "whisky", "whiskey", "new", "dont", "will", "close",
                 "along", "held", "get", "current", "available", "via", "the",
                 "see", "now", "amp", "yet")
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
#stemming and stem completion could be applied here exactly as sketched in
#the first analysis; they are left out for the same reason
dictCorpus <- myCorpus
# inspect the first three documents
inspect(myCorpus[1:3])
myDtm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))
inspect(myDtm[266:270,31:40])
#Frequent terms and associations
findFreqTerms(myDtm, lowfreq=10)
m <- as.matrix(myDtm)
# calculate the frequency of words
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq,min.freq=10,max.words=100, random.order=T, colors=pal2)