Inpirical-Coder · November 16, 2014 01:22
diff --git a/twitter.R b/twitter.R
 # Simple script for doing some data-analysis of tweets;
 # looking at "sentiment" and "emotion" using the sentiment package.
 # see https://sites.google.com/site/miningtwitter/questions/sentiment/sentiment
 # for background.

 # SETTINGS
 # =============================================================================
 # Run-mode switches and limits.
 authenticated = TRUE     # TRUE: load the credential from file.
 tweets.from.file = TRUE  # TRUE: load tweets from file rather than query.
 no.tweets = 1500         # Tweets to fetch in every search; <= 1,500.
 language = "en"          # The language you want tweets in.

 # Terms to query Twitter about.
 tweet.terms = c(
   "deutschebank", "goldmansachs", "jpmorgan", "ubs",
   "creditsuisse", "wellsfargo", "hsbc", "pimco",
   "moodysratings", "fitchratings", "aiginsurance", "fanniemae"
 )

 # DEPENDENCIES (Packages and source files)
 # =============================================================================

 # Load your Twitter API keys; needed for authentication. Must define two variables:
 # "consumer.key" and "consumer.secret".
 source("twitter_api_keys.R")

 InstallArchives = function() {
   # Installs "Rstem" and then "sentiment" (which depends on it) from source
   # tarballs in the CRAN archive area, since neither is on current CRAN.
   # Takes no arguments. Returns the list produced by lapply(), one element
   # per install.packages() call.

   archive.root = "http://cran.r-project.org/src/contrib/Archive/"
   tarballs = c(
     "Rstem/Rstem_0.4-1.tar.gz",
     "sentiment/sentiment_0.2.tar.gz"
   )

   # repos=NULL tells install.packages() that it is given package files
   # rather than package names to look up in a repository.
   lapply(paste0(archive.root, tarballs), install.packages, repos=NULL)
 }

 # Install the Rstem and sentiment packages if not installed.
 # Package names are the row names of the matrix returned by
 # installed.packages(); matching against them avoids scanning every cell.
 if (!("sentiment" %in% rownames(installed.packages()))) {
   InstallArchives()
 }

 # Define the dependency packages we need.
 required.packs = c("twitteR",
   "sentiment",          # Sentiment analysis.
   "tm",                 # Text mining.
   "plyr",               # Splitting, plotting, combining data.
   "ggplot2",            # Plotting.
   "wordcloud",          # Create wordclouds.
   "data.table",         # Data tables.
   "RColorBrewer"        # Palettes for visualisation.
 )

 # Install whatever is still missing in a single call.
 missing.packs = setdiff(required.packs, rownames(installed.packages()))
 if (length(missing.packs) > 0) {
   install.packages(missing.packs)
 }

 # Attach every dependency. library() raises an error when a package cannot
 # be loaded (require() would only return FALSE), so the [OK] message below
 # is reached only when all dependencies are really available.
 invisible(lapply(required.packs, library, character.only=TRUE))

 print("Dependencies met [OK]")


 # AUTHENTICATE
 # =============================================================================

 TwitterAuth = function() {
   # Creates an OAuth credential for the Twitter API, runs its handshake and
   # returns it. Reads "consumer.key" and "consumer.secret" from the
   # enclosing environment.
   # Side effect: the credential is saved to "twit_cred.Rdat".

   # Build the credential from the API keys and the Twitter OAuth endpoints.
   twit.cred = OAuthFactory$new(
     consumerKey    = consumer.key,
     consumerSecret = consumer.secret,
     requestURL     = "https://api.twitter.com/oauth/request_token",
     accessURL      = "https://api.twitter.com/oauth/access_token",
     authURL        = "https://api.twitter.com/oauth/authorize"
   )

   twit.cred$handshake()

   # The local name must stay "twit.cred": save() stores the object under
   # the name of the variable it is given.
   save(twit.cred, file="twit_cred.Rdat")
   twit.cred
 }

 # Obtain the credential: run the handshake when not yet authenticated,
 # otherwise load the saved file (expected to define "twit.cred").
 if(!authenticated) {
   twit.cred = TwitterAuth()
 } else {
   load("twit_cred.Rdat")
 }

 # Register the credential for the API calls that follow.
 registerTwitterOAuth(twit.cred)

 print("Authenticated with Twitter for use of API [OK]")

 # HARVEST TWEETS
 # =============================================================================

 HarvestTweets = function(tweet.terms, n = no.tweets, lang = language) {
   # Searches Twitter once per term and collects the tweet texts.
   #
   # Arguments:
   #   "tweet.terms" the search terms (character vector)
   #   "n"           tweets to request per term; defaults to the "no.tweets"
   #                 setting
   #   "lang"        language passed to searchTwitter(); defaults to the
   #                 "language" setting
   # Returns: a character matrix with columns "txt" (tweet text) and "term"
   #   (the search term that found it), duplicate rows removed. When no term
   #   yields any tweet, a zero-row matrix with the same two columns.
   #
   # A failed search raises a warning naming the term and is then skipped,
   # so one bad query does not abort the whole harvest.

   per.term = lapply(tweet.terms, function(term) {
     print(paste("Getting tweets for", term))
     found = tryCatch(
       searchTwitter(term, n = n, lang = lang),
       error = function(e) {
         warning("Search for '", term, "' failed: ", conditionMessage(e),
                 call. = FALSE)
         NULL
       }
     )

     # Nothing found (or the search failed): contribute no rows at all.
     if (length(found) == 0) {
       return(NULL)
     }

     # vapply() guarantees a character vector whatever the result size.
     txt = vapply(found, function(status) status[["text"]], character(1))
     cbind(txt = txt, term = term)
   })

   # Keep every term that produced rows, including those with a single tweet.
   per.term = Filter(Negate(is.null), per.term)
   if (length(per.term) == 0) {
     return(matrix(character(0), nrow = 0, ncol = 2,
                   dimnames = list(NULL, c("txt", "term"))))
   }

   # Bind all the tweets into one character matrix and purge duplicates.
   unique(do.call(rbind, per.term))
 }

 # Get the tweets: either query Twitter and save the result to file, or
 # load the file saved by an earlier run.
 if(!tweets.from.file) {
   tweets = HarvestTweets(tweet.terms)
   save(tweets, file="data/tweets.Rdat")
   print("Tweets harvested and saved [OK]")
 } else {
   load("data/tweets.Rdat")
   print("Tweets loaded from file [OK]")
 }
 # SCRUB TWEETS
 # =============================================================================

 ScrubTweets = function(txt) {
   # Scrubs tweets for NLP analysis.
   # Arguments: "txt" the texts of the tweets (character vector)
   # Returns: the scrubbed tweet texts (character vector)
   #
   # Steps, in order: drop retweet/"via" attributions, drop remaining
   # @mentions, drop links, lower-case, remove numbers, remove punctuation,
   # collapse whitespace.

   x = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", txt) # purge re-tweets
   x = gsub("@\\w+", "", x)                          # purge @...

   # Links have to go before punctuation is stripped. "\\S+" consumes the
   # whole URL; "\\w+" would stop at the ":" of "http://" and so never
   # remove a link.
   x = gsub("http\\S+", "", x)                       # purge http links

   x = tolower(x)                                    # make lower case
   x = removeNumbers(x)
   x = removePunctuation(x)
   stripWhitespace(x)
 }

 # Overwrite the "txt" column of the tweets matrix with its scrubbed form.
 tweets[, "txt"] = ScrubTweets(txt = tweets[, "txt"])
 print("Tweets scrubbed [OK]")


 # CLASSIFY TEXT BASED ON EMOTION AND POLARITY
 # =============================================================================

 ClassifyEmoPol = function(txt) {
   # Labels each text with its best-fitting emotion and polarity.
   #
   # Arguments: "txt" the texts to classify; passed unchanged to
   #            classify_emotion() and classify_polarity().
   # Returns:   the "BEST_FIT" columns of both classifiers, column-bound
   #            under the names "emotion" and "polarity".

   emotion.fit  = classify_emotion(txt, algorithm="bayes", prior=1.0)
   polarity.fit = classify_polarity(txt, algorithm="bayes")

   cbind(
     emotion  = emotion.fit[, "BEST_FIT"],
     polarity = polarity.fit[, "BEST_FIT"]
   )
 }

 # Column-bind classifications to the tweets matrix.
 # Only the "txt" column is classified: ClassifyEmoPol() takes the tweet
 # texts, and passing the whole matrix would also expose the "term" column
 # to the classifiers.
 tweets = cbind(tweets, ClassifyEmoPol(tweets[, "txt"]))
 print("Tweets classified for emotion and polarity [OK]")
	# Simple script for doing some data-analysis of tweets;
	# looking at "sentiment" and "emotion" using the sentiment package.
	# see https://sites.google.com/site/miningtwitter/questions/sentiment/sentiment
	# for background.

	# SETTINGS
	# =============================================================================
	# Run-mode switches and limits.
	authenticated = TRUE     # TRUE: load the credential from file.
	tweets.from.file = TRUE  # TRUE: load tweets from file rather than query.
	no.tweets = 1500         # Tweets to fetch in every search; <= 1,500.
	language = "en"          # The language you want tweets in.

	# Terms to query Twitter about.
	tweet.terms = c(
		"deutschebank", "goldmansachs", "jpmorgan", "ubs",
		"creditsuisse", "wellsfargo", "hsbc", "pimco",
		"moodysratings", "fitchratings", "aiginsurance", "fanniemae"
	)

	# DEPENDENCIES (Packages and source files)
	# =============================================================================

	# Load your Twitter API keys; needed for authentication. Must define two variables:
	# "consumer.key" and "consumer.secret".
	source("twitter_api_keys.R")

	InstallArchives = function() {
		# Installs "Rstem" and then "sentiment" (which depends on it) from source
		# tarballs in the CRAN archive area, since neither is on current CRAN.
		# Takes no arguments. Returns the list produced by lapply(), one element
		# per install.packages() call.

		archive.root = "http://cran.r-project.org/src/contrib/Archive/"
		tarballs = c(
			"Rstem/Rstem_0.4-1.tar.gz",
			"sentiment/sentiment_0.2.tar.gz"
		)

		# repos=NULL tells install.packages() that it is given package files
		# rather than package names to look up in a repository.
		lapply(paste0(archive.root, tarballs), install.packages, repos=NULL)
	}

	# Install the Rstem and sentiment packages if not installed.
	# Package names are the row names of the matrix returned by
	# installed.packages(); matching against them avoids scanning every cell.
	if (!("sentiment" %in% rownames(installed.packages()))) {
		InstallArchives()
	}

	# Define the dependency packages we need.
	required.packs = c("twitteR",
		"sentiment", # Sentiment analysis.
		"tm", # Text mining.
		"plyr", # Splitting, plotting, combining data.
		"ggplot2", # Plotting.
		"wordcloud", # Create wordclouds.
		"data.table", # Data tables.
		"RColorBrewer" # Palettes for visualisation.
	)

	# Install whatever is still missing in a single call.
	missing.packs = setdiff(required.packs, rownames(installed.packages()))
	if (length(missing.packs) > 0) {
		install.packages(missing.packs)
	}

	# Attach every dependency. library() raises an error when a package cannot
	# be loaded (require() would only return FALSE), so the [OK] message below
	# is reached only when all dependencies are really available.
	invisible(lapply(required.packs, library, character.only=TRUE))

	print("Dependencies met [OK]")


	# AUTHENTICATE
	# =============================================================================

	TwitterAuth = function() {
		# Creates an OAuth credential for the Twitter API, runs its handshake and
		# returns it. Reads "consumer.key" and "consumer.secret" from the
		# enclosing environment.
		# Side effect: the credential is saved to "twit_cred.Rdat".

		# Build the credential from the API keys and the Twitter OAuth endpoints.
		twit.cred = OAuthFactory$new(
			consumerKey    = consumer.key,
			consumerSecret = consumer.secret,
			requestURL     = "https://api.twitter.com/oauth/request_token",
			accessURL      = "https://api.twitter.com/oauth/access_token",
			authURL        = "https://api.twitter.com/oauth/authorize"
		)

		twit.cred$handshake()

		# The local name must stay "twit.cred": save() stores the object under
		# the name of the variable it is given.
		save(twit.cred, file="twit_cred.Rdat")
		twit.cred
	}

	# Obtain the credential: run the handshake when not yet authenticated,
	# otherwise load the saved file (expected to define "twit.cred").
	if(!authenticated) {
		twit.cred = TwitterAuth()
	} else {
		load("twit_cred.Rdat")
	}

	# Register the credential for the API calls that follow.
	registerTwitterOAuth(twit.cred)

	print("Authenticated with Twitter for use of API [OK]")

	# HARVEST TWEETS
	# =============================================================================

	HarvestTweets = function(tweet.terms, n = no.tweets, lang = language) {
		# Searches Twitter once per term and collects the tweet texts.
		#
		# Arguments:
		#   "tweet.terms" the search terms (character vector)
		#   "n"           tweets to request per term; defaults to the "no.tweets"
		#                 setting
		#   "lang"        language passed to searchTwitter(); defaults to the
		#                 "language" setting
		# Returns: a character matrix with columns "txt" (tweet text) and "term"
		#   (the search term that found it), duplicate rows removed. When no term
		#   yields any tweet, a zero-row matrix with the same two columns.
		#
		# A failed search raises a warning naming the term and is then skipped,
		# so one bad query does not abort the whole harvest.

		per.term = lapply(tweet.terms, function(term) {
			print(paste("Getting tweets for", term))
			found = tryCatch(
				searchTwitter(term, n = n, lang = lang),
				error = function(e) {
					warning("Search for '", term, "' failed: ", conditionMessage(e),
						call. = FALSE)
					NULL
				}
			)

			# Nothing found (or the search failed): contribute no rows at all.
			if (length(found) == 0) {
				return(NULL)
			}

			# vapply() guarantees a character vector whatever the result size.
			txt = vapply(found, function(status) status[["text"]], character(1))
			cbind(txt = txt, term = term)
		})

		# Keep every term that produced rows, including those with a single tweet.
		per.term = Filter(Negate(is.null), per.term)
		if (length(per.term) == 0) {
			return(matrix(character(0), nrow = 0, ncol = 2,
				dimnames = list(NULL, c("txt", "term"))))
		}

		# Bind all the tweets into one character matrix and purge duplicates.
		unique(do.call(rbind, per.term))
	}

	# Get the tweets: either query Twitter and save the result to file, or
	# load the file saved by an earlier run.
	if(!tweets.from.file) {
		tweets = HarvestTweets(tweet.terms)
		save(tweets, file="data/tweets.Rdat")
		print("Tweets harvested and saved [OK]")
	} else {
		load("data/tweets.Rdat")
		print("Tweets loaded from file [OK]")
	}
	# SCRUB TWEETS
	# =============================================================================

	ScrubTweets = function(txt) {
		# Scrubs tweets for NLP analysis.
		# Arguments: "txt" the texts of the tweets (character vector)
		# Returns: the scrubbed tweet texts (character vector)
		#
		# Steps, in order: drop retweet/"via" attributions, drop remaining
		# @mentions, drop links, lower-case, remove numbers, remove punctuation,
		# collapse whitespace.

		# The alternation is a plain "|": writing "\|" inside an R string is an
		# unrecognised escape and stops the script from parsing.
		x = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", txt) # purge re-tweets
		x = gsub("@\\w+", "", x) # purge @...

		# Links have to go before punctuation is stripped. "\\S+" consumes the
		# whole URL; "\\w+" would stop at the ":" of "http://" and so never
		# remove a link.
		x = gsub("http\\S+", "", x) # purge http links

		x = tolower(x) # make lower case
		x = removeNumbers(x)
		x = removePunctuation(x)
		stripWhitespace(x)
	}

	# Overwrite the "txt" column of the tweets matrix with its scrubbed form.
	tweets[, "txt"] = ScrubTweets(txt = tweets[, "txt"])
	print("Tweets scrubbed [OK]")


	# CLASSIFY TEXT BASED ON EMOTION AND POLARITY
	# =============================================================================

	ClassifyEmoPol = function(txt) {
		# Labels each text with its best-fitting emotion and polarity.
		#
		# Arguments: "txt" the texts to classify; passed unchanged to
		#            classify_emotion() and classify_polarity().
		# Returns:   the "BEST_FIT" columns of both classifiers, column-bound
		#            under the names "emotion" and "polarity".

		emotion.fit  = classify_emotion(txt, algorithm="bayes", prior=1.0)
		polarity.fit = classify_polarity(txt, algorithm="bayes")

		cbind(
			emotion  = emotion.fit[, "BEST_FIT"],
			polarity = polarity.fit[, "BEST_FIT"]
		)
	}

	# Column-bind classifications to the tweets matrix.
	# Only the "txt" column is classified: ClassifyEmoPol() takes the tweet
	# texts, and passing the whole matrix would also expose the "term" column
	# to the classifiers.
	tweets = cbind(tweets, ClassifyEmoPol(tweets[, "txt"]))
	print("Tweets classified for emotion and polarity [OK]")