R Sentiment Scoring HSBC w/ Harvard General Inquirer
# Code to fetch news streams from five live sources, process the text,
# and apply a simple sentiment-scoring algorithm.
#
# A writeup of the analysis can be found here:
# https://www.linkedin.com/pulse/article/20141109035942-34768479-r-sentiment-scoring-hsbc-w-harvard-general-inquirer
# Define the packages we want to load: | |
packs = c( | |
"tm", # Text mining | |
"tm.plugin.webmining", # Web-source plugin for text mining | |
"SnowballC", # Stemmer | |
"RColorBrewer", # Colors for visualisation | |
"ggplot2", # Plotting | |
"wordcloud", # Draw wordclouds | |
"openNLP" # Split text into sentences. | |
) | |
sapply(packs, require, character.only=TRUE) # Load the packages. | |
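# If any require() call above returned FALSE, install the missing packages
# and re-run the sapply() call (a minimal sketch, assuming a configured
# CRAN mirror; "missing.packs" is not part of the original gist):
missing.packs = packs[!packs %in% rownames(installed.packages())]
if (length(missing.packs) > 0) install.packages(missing.packs)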
# Download the corpora and insert them in a named list.
corpora = list(
  googlefinance = WebCorpus(GoogleFinanceSource("NYSE:HSBC")),
  googlenews    = WebCorpus(GoogleNewsSource("HSBC")),
  yahoofinance  = WebCorpus(YahooFinanceSource("HSBC")),
  yahooinplay   = WebCorpus(YahooInplaySource()),
  yahoonews     = WebCorpus(YahooNewsSource("HSBC"))
)
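# The save() below writes to ./data; create the directory if it is absent
# (a small guard, assuming the gist's original directory layout).
if (!dir.exists("data")) dir.create("data")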
# Save the corpora list.
save(corpora, file="data/corpora.Rdat")
# load("data/corpora.Rdat") # Uncomment to restore a previously saved run.
# Break the corpora down into sentences; define helper functions to do so.
ToSentences = function(text, language="en") {
  # Splits text into sentences using an Apache OpenNLP sentence detector.
  # Arguments:
  #   "text"     the text to be processed (character)
  #   "language" ISO-639 code of the language of the text (character)
  # Returns:
  #   sentences of the text (character vector)
  if (length(text) == 0) {return("")}
  if (nchar(text) == 0) {return("")} # Cover special case of 0-character text.
  # Convert text to a String object; this allows splitting by index.
  text = as.String(text)
  # Discover the sentence boundaries in the text (qualify annotate with NLP::
  # because ggplot2 also exports an annotate function).
  markers = NLP::annotate(
    text,
    Maxent_Sent_Token_Annotator(language=language) # Annotator from openNLP.
  )
  # Return the sentences by splitting the text at the boundaries.
  text[markers]
}
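# Illustrative usage (not part of the original gist): a two-sentence input
# should come back as a character vector of two sentences, e.g.
# ToSentences("HSBC shares rose. Analysts were upbeat.")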
CorpusToSentences = function(corpus) {
  # Split every document in the corpus into sentences and return a new corpus
  # with all the sentences as individual documents.
  # Extract the text from each document in the corpus.
  text = lapply(corpus, "[[", "content")
  # Split each document's text into sentences.
  docs = lapply(text, ToSentences)
  docs = as.vector(unlist(docs))
  # Return a corpus with the sentences as documents.
  Corpus(VectorSource(docs))
}
# Create a new corpus which merges the existing corpora after splitting them
# into sentences.
corpus = Reduce(c, lapply(corpora, CorpusToSentences))
# Process the corpus contents.
corpus = tm_map(corpus, removePunctuation, preserve_intra_word_dashes=TRUE)
corpus = tm_map(corpus, content_transformer(tolower))
corpus = tm_map(corpus, removeWords, stopwords("english"))
corpus = tm_map(corpus, removeNumbers)
corpus = tm_map(corpus, stripWhitespace)
# Optional term normalisation via substitution:
# toString = content_transformer(function(x, from, to) gsub(from, to, x))
# corpus = tm_map(corpus, toString, "hsbc", "hsbc")
# Stemming (left disabled):
# corpus = tm_map(corpus, stemDocument)
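# Optional sanity check: use tm's inspect() to peek at a few of the cleaned
# sentence-documents, e.g. inspect(corpus[1:3]).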
# Create a document-term matrix from the corpus.
dtm = DocumentTermMatrix(corpus)
# Subset the DTM to include only documents containing the term "hsbc".
dtm = dtm[rowSums(as.matrix(dtm[ , "hsbc"])) > 0, ]
# Remove terms which are not contained in any of the remaining documents.
dtm = dtm[ , colSums(as.matrix(dtm)) > 0]
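# At this point dim(dtm) gives (sentences mentioning "hsbc") x (retained terms).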
# ACQUIRING AND PROCESSING THE LEXICON.
# Load the sentiment lexicon (saved in the working directory as a
# comma-separated value file).
lex = read.csv("inquirerbasic.csv", stringsAsFactors=FALSE)
# Collapse words with multiple entries into one entry. These are marked
# with a trailing #1, #2, ...
# Remove the #1 tags.
lex$Entry = gsub("#1", "", lex$Entry)
# Remove entries that are still numbered (i.e. #2 or higher).
lex = lex[!grepl("#", lex$Entry), ]
# Extract the positive and negative words from the lexicon.
neg.lex = tolower(lex$Entry[lex$Negativ != ""])
pos.lex = tolower(lex$Entry[lex$Positiv != ""])
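# Quick sanity check (illustrative): length(pos.lex) and length(neg.lex)
# report how many positive and negative lexicon entries were loaded.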
terms = colnames(dtm)
# Find the positive and negative terms using the lexicons.
neg.terms = terms[terms %in% neg.lex]
pos.terms = terms[terms %in% pos.lex]
# Exclude positive terms which may be questionable in a financial context
# (defined here but not applied below; substitute pos.terms.adj for
# pos.terms to use it).
pos.terms.adj = setdiff(pos.terms, c("equity", "share", "consensus"))
# Calculate the negative and positive scores for each sentence-document.
neg.scores = rowSums(as.matrix(dtm[ , neg.terms]))
pos.scores = rowSums(as.matrix(dtm[ , pos.terms]))
document.scores = pos.scores - neg.scores
# Calculate the document signs (+1 positive, -1 negative, 0 neutral).
document.signs = sign(document.scores)
# Calculate the sentiment score: the share of non-neutral sentences
# that are positive.
sentiment.score = sum(document.signs == 1) / sum(document.signs != 0)
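# Report the headline figure: the fraction of sentiment-bearing sentences
# that score positive (an illustrative print, not in the original gist).
print(sentiment.score)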
## Visualisation:
# Generate word clouds (positive and negative).
PosCloud = function() {
  wordcloud(
    pos.terms,
    colSums(as.matrix(dtm[ , pos.terms])),
    min.freq=1,
    scale=c(4, 0.7),
    colors=brewer.pal(n=9, "Blues")[6:9]
  )
}
NegCloud = function() {
  wordcloud(
    neg.terms,
    colSums(as.matrix(dtm[ , neg.terms])),
    min.freq=1,
    scale=c(4, 0.7),
    colors=brewer.pal(n=9, "Reds")[6:9]
  )
}
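# Usage (illustrative): call either function to draw to the active graphics
# device, e.g.
# PosCloud()
# NegCloud()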