Created
November 16, 2014 01:22
-
-
Save Inpirical-Coder/aa8a93566f9e7404e4fe to your computer and use it in GitHub Desktop.
Simple script to download, scrub and classify Tweets according to polarity and emotion using a simple Bayes classifier
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Simple script for analysing tweets: classifies each tweet by "sentiment"
# (polarity) and "emotion" using the sentiment package.
# See https://sites.google.com/site/miningtwitter/questions/sentiment/sentiment
# for background.
# SETTINGS
# =============================================================================
# If TRUE, the saved Twitter credential is loaded from file.
authenticated <- TRUE
# If TRUE, tweets are loaded from file rather than queried from Twitter.
tweets.from.file <- TRUE
# Number of tweets to fetch in every search; must be <= 1,500.
no.tweets <- 1500
# The terms to query Twitter about.
tweet.terms <- c(
  "deutschebank", "goldmansachs", "jpmorgan", "ubs",
  "creditsuisse", "wellsfargo", "hsbc", "pimco",
  "moodysratings", "fitchratings", "aiginsurance", "fanniemae"
)
# The language you want tweets in.
language <- "en"
# DEPENDENCIES (Packages and source files)
# =============================================================================
# Load your Twitter API keys; needed for authentication. The sourced file must
# define two variables: "consumer.key" and "consumer.secret".
source("twitter_api_keys.R")
# Flag a key file that does not define both variables right away, instead of
# letting the problem surface later when the keys are first used.
if (!exists("consumer.key") || !exists("consumer.secret")) {
  warning(
    "twitter_api_keys.R should define 'consumer.key' and 'consumer.secret'.",
    call. = FALSE
  )
}
InstallArchives <- function() {
  # Installs the "sentiment" package and also its "Rstem" dependency.
  # Neither package is on current CRAN, therefore both are downloaded as source
  # tarballs from the CRAN archive. "Rstem" is listed first so that it is
  # installed before "sentiment".
  # Returns: (invisibly) a list with the result of each install.packages() call.

  # Base URL of the CRAN archive. HTTPS is used so that the code being
  # installed cannot be altered in transit.
  repo.url <- "https://cran.r-project.org/src/contrib/Archive/"
  # URL tails of the packages we want to install.
  pack.urls <- c(
    "Rstem/Rstem_0.4-1.tar.gz",
    "sentiment/sentiment_0.2.tar.gz"
  )
  # Install the packages; repos = NULL makes install.packages() treat each
  # argument as a file/URL, and type = "source" states that these are tarballs.
  invisible(lapply(pack.urls, function(pack.url) {
    install.packages(paste0(repo.url, pack.url), repos = NULL, type = "source")
  }))
}
# Install the Rstem and sentiment packages if either one is not installed.
# The comparison is made against package names only (the row names of the
# installed.packages() matrix), not against every cell of that matrix.
if (!all(c("Rstem", "sentiment") %in% rownames(installed.packages()))) {
  InstallArchives()
}
# Define the dependency packages we need.
required.packs <- c(
  "twitteR",      # Twitter API access.
  "sentiment",    # Sentiment analysis.
  "tm",           # Text mining.
  "plyr",         # Splitting, plotting, combining data.
  "ggplot2",      # Plotting.
  "wordcloud",    # Create wordclouds.
  "data.table",   # Data tables.
  "RColorBrewer"  # Palettes for visualisation.
)
# Install the required packages that are missing.
missing.packs <- setdiff(required.packs, rownames(installed.packages()))
if (length(missing.packs) > 0) {
  install.packages(missing.packs)
}
# Load the packages. library() raises an error when a package cannot be
# loaded (require() would only return FALSE), so the confirmation below is
# printed only if every dependency really is available.
for (pack in required.packs) {
  library(pack, character.only = TRUE)
}
print("Dependencies met [OK]")
# AUTHENTICATE
# =============================================================================
TwitterAuth <- function() {
  # Authenticates with the Twitter API via an OAuth handshake.
  # Uses the variables "consumer.key" and "consumer.secret", which are not
  # defined inside this function and must exist before it is called.
  # Side effect: writes the credential to the file "twit_cred.Rdat".
  # Returns: the credential object after the handshake.
  twit.cred <- OAuthFactory$new(
    consumerKey    = consumer.key,
    consumerSecret = consumer.secret,
    requestURL     = "https://api.twitter.com/oauth/request_token",
    accessURL      = "https://api.twitter.com/oauth/access_token",
    authURL        = "https://api.twitter.com/oauth/authorize"
  )
  twit.cred$handshake()
  # save() stores the object under its variable name, so a later load() of
  # this file recreates a variable called "twit.cred".
  save(twit.cred, file = "twit_cred.Rdat")
  twit.cred
}
# If you have already authenticated before, just load the saved credential;
# otherwise (or if the saved credential file is missing) authenticate now.
if (authenticated && file.exists("twit_cred.Rdat")) {
  load("twit_cred.Rdat")  # Recreates the variable "twit.cred".
} else {
  if (authenticated) {
    warning(
      "Credential file 'twit_cred.Rdat' not found; authenticating again.",
      call. = FALSE
    )
  }
  twit.cred <- TwitterAuth()
}
registerTwitterOAuth(twit.cred)
print("Authenticated with Twitter for use of API [OK]")
# HARVEST TWEETS
# =============================================================================
HarvestTweets <- function(tweet.terms, n = no.tweets, lang = language) {
  # Searches Twitter once per term and collects the tweet texts.
  # Arguments: "tweet.terms" the search terms (character vector)
  #            "n" number of tweets to request per term; defaults to the
  #                "no.tweets" setting
  #            "lang" language to restrict the search to; defaults to the
  #                "language" setting
  # Returns: a character matrix with columns "txt" (tweet text) and "term"
  #          (the search term that found it), duplicate rows removed; NULL
  #          if no search returned anything.
  per.term <- lapply(tweet.terms, function(term) {
    print(paste("Getting tweets for", term))
    found <- tryCatch(
      searchTwitter(term, n = n, lang = lang),
      error = function(e) {
        # A failed search should not abort the whole harvest, but it should
        # not pass unnoticed either.
        warning(
          "Search for '", term, "' failed: ", conditionMessage(e),
          call. = FALSE
        )
        NULL
      }
    )
    # Nothing found (or the search failed): contribute no rows for this term.
    if (length(found) == 0) {
      return(NULL)
    }
    txt <- vapply(found, function(status) status[["text"]], character(1))
    cbind(txt = txt, term = term)
  })
  # Bind all the tweets into one character matrix and purge duplicates.
  # rbind() skips the NULL entries left by empty searches.
  unique(do.call(rbind, per.term))
}
# If tweets have been harvested before, just load them from file; otherwise
# query Twitter and save the result.
if (tweets.from.file) {
  load("data/tweets.Rdat")
  print("Tweets loaded from file [OK]")
} else {
  tweets <- HarvestTweets(tweet.terms)
  # save() does not create missing directories, so make sure "data" exists.
  dir.create("data", showWarnings = FALSE)
  save(tweets, file = "data/tweets.Rdat")
  print("Tweets harvested and saved [OK]")
}
# SCRUB TWEETS
# =============================================================================
ScrubTweets <- function(txt) {
  # Scrubs tweets for NLP analysis.
  # Arguments: "txt" the texts of the tweets (character vector)
  # Returns: the scrubbed tweet texts (character vector, same length as "txt")
  # Digits, punctuation and whitespace are handled with base regular
  # expressions, so the function works on a plain character vector without
  # needing any extra package.
  x <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", txt)  # purge re-tweet markers
  x <- gsub("@\\w+", "", x)                          # purge remaining @...
  # Purge links. "\\S+" consumes the whole URL up to the next whitespace;
  # "\\w+" would stop at the "://" and leave the link in the text.
  x <- gsub("http\\S+", "", x)
  x <- tolower(x)                                    # make lower case
  x <- gsub("[[:digit:]]+", "", x)                   # remove numbers
  x <- gsub("[[:punct:]]+", "", x)                   # remove punctuation
  gsub("[[:space:]]+", " ", x)                       # collapse whitespace runs
}
# Replace the raw tweet texts with their scrubbed versions, in place.
tweets[, "txt"] <- ScrubTweets(txt = tweets[, "txt"])
print("Tweets scrubbed [OK]")
# CLASSIFY TEXT BASED ON EMOTION AND POLARITY
# =============================================================================
ClassifyEmoPol <- function(txt) {
  # Classifies texts both by emotion category and by polarity, using the
  # "bayes" algorithm of classify_emotion() and classify_polarity().
  # Arguments: "txt" a vector with the texts to classify (character)
  # Returns: a matrix with two columns, "emotion" and "polarity", holding the
  #          "BEST_FIT" value of each classifier for every text.
  emo.fit <- classify_emotion(txt, algorithm = "bayes", prior = 1.0)
  pol.fit <- classify_polarity(txt, algorithm = "bayes")
  best.emotion <- emo.fit[, "BEST_FIT"]
  best.polarity <- pol.fit[, "BEST_FIT"]
  cbind(emotion = best.emotion, polarity = best.polarity)
}
# Column-bind classifications to the tweets matrix. Only the "txt" column is
# classified, so that the search term stored in the "term" column is not
# treated as part of the tweet text.
tweets <- cbind(tweets, ClassifyEmoPol(tweets[, "txt"]))
print("Tweets classified for emotion and polarity [OK]")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment