Last active
August 29, 2015 14:08
-
-
Save Inpirical-Coder/526fb5e73455e486223a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Simple example: acquire a stream of news text with the R "tm" package and
# turn it into a document-term matrix.
#
# Objects created at top level: `sym`, `corpus`, `dtm`.

# Dependencies ----
# library() stops immediately when a package is missing; require() would only
# return FALSE and let the script fail later with a less helpful error.
library(tm)                   # Text-mining framework.
library(tm.plugin.webmining)  # Web-mining plugin for text mining.
library(SnowballC)            # Stemming backend (optional stemming step).

# Acquisition ----
# Define the symbol we want to acquire news on.
sym <- "NYSE:HSBC"

# Build a corpus of the news items for that symbol.
# NOTE(review): this presumably fetches a live news feed over the network;
# confirm the source is still served for the installed plugin version.
corpus <- WebCorpus(GoogleFinanceSource(sym))

# Processing the corpus ----
# Order matters: stop words are removed BEFORE punctuation. The English
# stop-word list contains contractions ("don't", "isn't", ...); stripping
# apostrophes first would turn them into "dont"/"isnt" in the text, which no
# longer match the list and would survive the filter.

# All text to lower case (the stop-word list is lower case).
corpus <- tm_map(corpus, content_transformer(tolower))

# Remove English stop words.
corpus <- tm_map(corpus, removeWords, stopwords("english"))

# Remove punctuation, but preserve dashes inside words.
corpus <- tm_map(corpus, removePunctuation, preserve_intra_word_dashes = TRUE)

# Remove all numbers from the corpus.
corpus <- tm_map(corpus, removeNumbers)

# Collapse the runs of white-space left behind by the removals above.
corpus <- tm_map(corpus, stripWhitespace)

# Optional: remove particular words assumed to be analytically uninteresting.
# corpus <- tm_map(corpus, removeWords, c("hsbc", "plc", "inc"))

# Optional: stem the corpus.
# corpus <- tm_map(corpus, stemDocument)

# Document-term matrix ----
# Create a "Document-Term Matrix" from the processed corpus.
dtm <- DocumentTermMatrix(corpus)

# Optional: deal with sparsity (example below uses a 66% sparsity threshold).
# dtm <- removeSparseTerms(dtm, 0.66)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment