R Sentiment Scoring HSBC w/ Harvard General Inquirer
# Code to fetch news streams from five live sources, process the text,
# and apply a simple sentiment-scoring algorithm.
#
# A writeup of the analysis can be found here:
# https://www.linkedin.com/pulse/article/20141109035942-34768479-r-sentiment-scoring-hsbc-w-harvard-general-inquirer
# Define the packages we want to load: | |
packs = c( | |
"tm", # Text mining | |
"tm.plugin.webmining", # Web-source plugin for text mining | |
"SnowballC", # Stemmer | |
"RColorBrewer", # Colors for visualisation | |
"ggplot2", # Plotting | |
"wordcloud", # Draw wordclouds | |
"openNLP" # Split text into sentences. | |
) | |
sapply(packs, require, character.only=TRUE) # Load the packages. | |
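# If any require() call above returned FALSE, install the missing packages
# and re-run the sapply() call (a minimal sketch, assuming a configured
# CRAN mirror; "missing.packs" is not part of the original gist):
missing.packs = packs[!packs %in% rownames(installed.packages())]
if (length(missing.packs) > 0) install.packages(missing.packs)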
# Download the corpora and insert them in a named list.
corpora = list(
  googlefinance = WebCorpus(GoogleFinanceSource("NYSE:HSBC")),
  googlenews    = WebCorpus(GoogleNewsSource("HSBC")),
  yahoofinance  = WebCorpus(YahooFinanceSource("HSBC")),
  yahooinplay   = WebCorpus(YahooInplaySource()),
  yahoonews     = WebCorpus(YahooNewsSource("HSBC"))
)
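# The save() below writes to ./data; create the directory if it is absent
# (a small guard, assuming the gist's original directory layout).
if (!dir.exists("data")) dir.create("data")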
# Save the corpora list.
save(corpora, file="data/corpora.Rdat")
# load("data/corpora.Rdat") # Uncomment to restore a previously saved run.
# Break the corpora down into sentences; define helper functions to do so.
ToSentences = function(text, language="en") {
  # Splits text into sentences using an Apache OpenNLP sentence detector.
  # Arguments:
  #   "text"     the text to be processed (character)
  #   "language" ISO-639 code of the language of the text (character)
  # Returns:
  #   sentences of the text (character vector)
  if (length(text) == 0) {return("")}
  if (nchar(text) == 0) {return("")} # Cover special case of 0-character text.
  # Convert text to a String object; this allows splitting by index.
  text = as.String(text)
  # Discover the sentence boundaries in the text (qualify annotate with NLP::
  # because ggplot2 also exports an annotate function).
  markers = NLP::annotate(
    text,
    Maxent_Sent_Token_Annotator(language=language) # Annotator from openNLP.
  )
  # Return the sentences by splitting the text at the boundaries.
  text[markers]
}
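# Illustrative usage (not part of the original gist): a two-sentence input
# should come back as a character vector of two sentences, e.g.
# ToSentences("HSBC shares rose. Analysts were upbeat.")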
CorpusToSentences = function(corpus) {
  # Split every document in the corpus into sentences and return a new corpus
  # with all the sentences as individual documents.
  # Extract the text from each document in the corpus.
  text = lapply(corpus, "[[", "content")
  # Split each document's text into sentences.
  docs = lapply(text, ToSentences)
  docs = as.vector(unlist(docs))
  # Return a corpus with the sentences as documents.
  Corpus(VectorSource(docs))
}
# Create a new corpus which merges the existing corpora after splitting them
# into sentences.
corpus = Reduce(c, lapply(corpora, CorpusToSentences))
# Process the corpus contents.
corpus = tm_map(corpus, removePunctuation, preserve_intra_word_dashes=TRUE)
corpus = tm_map(corpus, content_transformer(tolower))
corpus = tm_map(corpus, removeWords, stopwords("english"))
corpus = tm_map(corpus, removeNumbers)
corpus = tm_map(corpus, stripWhitespace)
# Optional term normalisation via substitution:
# toString = content_transformer(function(x, from, to) gsub(from, to, x))
# corpus = tm_map(corpus, toString, "hsbc", "hsbc")
# Stemming (left disabled):
# corpus = tm_map(corpus, stemDocument)
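# Optional sanity check: use tm's inspect() to peek at a few of the cleaned
# sentence-documents, e.g. inspect(corpus[1:3]).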
# Create a document-term matrix from the corpus.
dtm = DocumentTermMatrix(corpus)
# Subset the DTM to include only documents containing the term "hsbc".
dtm = dtm[rowSums(as.matrix(dtm[ , "hsbc"])) > 0, ]
# Remove terms which are not contained in any of the remaining documents.
dtm = dtm[ , colSums(as.matrix(dtm)) > 0]
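# At this point dim(dtm) gives (sentences mentioning "hsbc") x (retained terms).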
# ACQUIRING AND PROCESSING THE LEXICON.
# Load the sentiment lexicon (saved in the working directory as a
# comma-separated value file).
lex = read.csv("inquirerbasic.csv", stringsAsFactors=FALSE)
# Collapse words with multiple entries into one entry. These are marked
# with a trailing #1, #2, ...
# Remove the #1 tags.
lex$Entry = gsub("#1", "", lex$Entry)
# Remove entries that are still numbered (i.e. #2 or higher).
lex = lex[!grepl("#", lex$Entry), ]
# Extract the positive and negative words from the lexicon.
neg.lex = tolower(lex$Entry[lex$Negativ != ""])
pos.lex = tolower(lex$Entry[lex$Positiv != ""])
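# Quick sanity check (illustrative): length(pos.lex) and length(neg.lex)
# report how many positive and negative lexicon entries were loaded.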
terms = colnames(dtm)
# Find the positive and negative terms using the lexicons.
neg.terms = terms[terms %in% neg.lex]
pos.terms = terms[terms %in% pos.lex]
# Exclude positive terms which may be questionable in a financial context
# (defined here but not applied below; substitute pos.terms.adj for
# pos.terms to use it).
pos.terms.adj = setdiff(pos.terms, c("equity", "share", "consensus"))
# Calculate the negative and positive scores for each sentence-document.
neg.scores = rowSums(as.matrix(dtm[ , neg.terms]))
pos.scores = rowSums(as.matrix(dtm[ , pos.terms]))
document.scores = pos.scores - neg.scores
# Calculate the document signs (+1 positive, -1 negative, 0 neutral).
document.signs = sign(document.scores)
# Calculate the sentiment score: the share of non-neutral sentences
# that are positive.
sentiment.score = sum(document.signs == 1) / sum(document.signs != 0)
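# Report the headline figure: the fraction of sentiment-bearing sentences
# that score positive (an illustrative print, not in the original gist).
print(sentiment.score)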
## Visualisation:
# Generate word clouds (positive and negative).
PosCloud = function() {
  wordcloud(
    pos.terms,
    colSums(as.matrix(dtm[ , pos.terms])),
    min.freq=1,
    scale=c(4, 0.7),
    colors=brewer.pal(n=9, "Blues")[6:9]
  )
}
NegCloud = function() {
  wordcloud(
    neg.terms,
    colSums(as.matrix(dtm[ , neg.terms])),
    min.freq=1,
    scale=c(4, 0.7),
    colors=brewer.pal(n=9, "Reds")[6:9]
  )
}
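# Usage (illustrative): call either function to draw to the active graphics
# device, e.g.
# PosCloud()
# NegCloud()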