# Twitter Topic Modeling Using R
# Author: Bryan Goodrich
# Date Created: February 13, 2015
# Last Modified: April 3, 2015
#
# Use the twitteR API to query Twitter, parse the search results, and
# fit a series of topic models to identify potentially useful topics
# in your query content. This has applications for social media,
# research, or general curiosity.
#
# References
# http://davetang.org/muse/2013/04/06/using-the-r_twitter-package/
# https://blog.credera.com/business-intelligence/twitter-analytics-using-r-part-2-create-word-cloud/
# Load relevant libraries
# Use install.packages("package") to install any you don't have.
# If, like me, you're on Linux and cannot install the topicmodels
# package because of a missing GSL component, you'll need to install
# the GSL library and its development headers. If, like me, you're on
# a Debian-based system, it's pretty easy:
# sudo apt-get install libgsl0ldbl libgsl0-dev
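# For convenience, a one-liner covering everything this script loads
# (adjust to whatever you're actually missing):
# install.packages(c("twitteR", "NLP", "tm", "RColorBrewer",
#                    "wordcloud", "topicmodels", "SnowballC"))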
library(twitteR)
library(NLP)
library(tm)
library(RColorBrewer)
library(wordcloud)
library(topicmodels)
library(SnowballC)
# Populate with your https://apps.twitter.com application information
# to use my_setup() to register your session. No, you cannot see mine!
my_setup <- function() {
    ckey <- "Your Consumer Key Here"
    csec <- "Your Consumer Secret Here"
    akey <- "Your Access Token Here"
    asec <- "Your Access Token Secret Here"
    setup_twitter_oauth(ckey, csec, akey, asec)
}
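# A safer variant, if you'd rather keep credentials out of the source:
# read them from environment variables. The variable names below are
# just an assumption; set them in your shell or ~/.Renviron first.
# my_setup <- function() {
#     setup_twitter_oauth(Sys.getenv("TWITTER_CONSUMER_KEY"),
#                         Sys.getenv("TWITTER_CONSUMER_SECRET"),
#                         Sys.getenv("TWITTER_ACCESS_TOKEN"),
#                         Sys.getenv("TWITTER_ACCESS_SECRET"))
# }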
# Convenience function for accessing the text part of a tweet
# returned by the twitteR API. It is used below.
tweet_text <- function(x) x$getText()
# Submit a search query (terms separated by "+") and get back a
# character vector of tweet text (our raw corpus).
tweet_corpus <- function(search, n = 5000, ...) {
    payload <- searchTwitter(search, n = n, ...)
    sapply(payload, tweet_text)
}
# Search for some key terms; grab a lot if you want, but Twitter will
# rate-limit you as it sees fit. searchTwitter() also has spatial options.
# Try these Sacramento coordinates: geocode = '38.630404,-121.293535,50mi'
my_setup()  # Set up the login for this session
tweets <- tweet_corpus("energy+electricity", n = 10000, lang = 'en')
# Save your corpus (because you're limited in how often you can do
# this for free!)
saveRDS(tweets, file = "Workspace/energy_themes/tweets.Rds", compress = 'xz')

# Okay, read that corpus back in from disk. I'm sure you have a
# different save location, right?
tweets <- readRDS("Workspace/energy_themes/tweets.Rds")
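# Quick sanity check before cleaning (optional):
# length(tweets)   # how many tweets came back
# head(tweets, 3)  # eyeball a few raw ones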
# Okay, here's where things get tricky. See references for examples.
# Whether you hit problems depends on whether your system is using
# parallel processing. If it is, you'll need the mc.cores parameter
# as shown later. That took me a while to get! Thanks to the
# references for clearing that up.
# Here we pre-process the data in some standard ways. Each step is
# explained in its comment.
tweets <- iconv(tweets, to = "ASCII", sub = " ")  # Convert to basic ASCII text to avoid silly characters
tweets <- tolower(tweets)                         # Make everything consistently lower case
tweets <- gsub("\\brt\\b", " ", tweets)           # Remove the "rt" (retweet) tag so duplicates are duplicates
tweets <- gsub("@\\w+", " ", tweets)              # Remove user names (all proper names if you're wise!)
tweets <- gsub("http\\S+", " ", tweets)           # Remove links
tweets <- gsub("[[:punct:]]", " ", tweets)        # Remove punctuation
tweets <- gsub("[ \t]{2,}", " ", tweets)          # Collapse runs of spaces and tabs
tweets <- gsub("\\bamp\\b", " ", tweets)          # "&" is "&amp;" in HTML, so "amp" is left once punctuation goes
tweets <- gsub("^ ", "", tweets)                  # Leading blanks
tweets <- gsub(" $", "", tweets)                  # Trailing blanks
tweets <- gsub(" +", " ", tweets)                 # Collapse remaining spaces (should just do all whitespace, no?)
tweets <- unique(tweets)                          # Now get rid of duplicates!
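# Spot-check the cleaned text (optional): a few random rows is usually
# enough to catch a pattern above that ate too much.
# sample(tweets, 5)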
# Convert to a tm corpus and use its API for some additional fun
corpus <- Corpus(VectorSource(tweets))  # Create corpus object

# Remove English stop words. This could be greatly expanded!
# Don't forget the mc.cores thing
corpus <- tm_map(corpus, removeWords, stopwords("en"), mc.cores = 1)

# Remove numbers. This could have been done earlier, of course.
corpus <- tm_map(corpus, removeNumbers, mc.cores = 1)

# Stem the words. Google if you don't understand
corpus <- tm_map(corpus, stemDocument, mc.cores = 1)

# Remove the stems associated with our search terms!
corpus <- tm_map(corpus, removeWords, c("energi", "electr"), mc.cores = 1)
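# Inspect a few documents to confirm the stemming and stop word
# removal did what you expect (optional):
# inspect(corpus[1:3])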
# Why not visualize the corpus now?
# Mine had a lot to do with {solar, power, renew, new, can, save, suppl, wind, price, use}
pal <- brewer.pal(8, "Dark2")
wordcloud(corpus, min.freq = 2, max.words = 150, random.order = TRUE, col = pal)
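# A plain frequency list can be easier to read than the cloud. A
# minimal sketch using tm's findFreqTerms; the lowfreq cutoff of 20
# and the dtm_check name are arbitrary choices for illustration.
# dtm_check <- DocumentTermMatrix(corpus)
# findFreqTerms(dtm_check, lowfreq = 20)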
# Now for Topic Modeling
# Get the document lengths and make sure we only create a DTM for
# tweets with some actual content left after cleaning
doc.lengths <- rowSums(as.matrix(DocumentTermMatrix(corpus)))
dtm <- DocumentTermMatrix(corpus[doc.lengths > 0])
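# Check the dimensions (documents x terms) before fitting; LDA will
# error on empty documents, which is why we filtered above.
# dim(dtm)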
# model <- LDA(dtm, 10)  # Go ahead and test a simple model if you want

# Now for some topics
SEED <- sample(1:1000000, 1)  # Pick a random seed; record it if you want to reproduce a run
k <- 10                       # Let's start with 10 topics

# This might take a minute!
models <- list(
    CTM       = CTM(dtm, k = k, control = list(seed = SEED, var = list(tol = 10^-4), em = list(tol = 10^-3))),
    VEM       = LDA(dtm, k = k, control = list(seed = SEED)),
    VEM_Fixed = LDA(dtm, k = k, control = list(estimate.alpha = FALSE, seed = SEED)),
    Gibbs     = LDA(dtm, k = k, method = "Gibbs", control = list(seed = SEED, burnin = 1000,
                                                                 thin = 100, iter = 1000))
)
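# One rough way to compare the fits is the log-likelihood each model
# assigns to the data (higher is better, though it's no substitute
# for reading the topics yourself):
# sapply(models, logLik)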
# There you have it. models now holds 4 fitted models. See the topicmodels API documentation for details.

# Top 10 terms of each topic for each model
# Do you see any themes you can attach to these "topics" (lists of words)?
lapply(models, terms, 10)
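# terms() only shows the top words; the underlying per-document topic
# probabilities are also available if you want soft assignments
# instead of hard ones (shown here for the Gibbs fit):
# head(posterior(models$Gibbs)$topics)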
# Matrix of tweet assignments to the predominant topic of that tweet
# for each of the models, in case you wanted to categorize them
assignments <- sapply(models, topics)
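# How evenly do the tweets spread across topics? A quick tabulation
# per model (column names match the list above):
# table(assignments[, "Gibbs"])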