Inpirical-Coder · August 29, 2015 14:08
diff --git a/R_tm_example b/R_tm_example
 # Simple example of acquiring text-stream assets with the R "tm" package:
 # fetch news items for one ticker symbol, clean the text, and build a
 # document-term matrix (`dtm`) from the result.

 # library() stops immediately if a package is missing, whereas require()
 # only warns and returns FALSE, letting the script fail later with an
 # unrelated "could not find function" error.
 library(tm)                     # Text-mining framework.
 library(tm.plugin.webmining)    # Web-mining plugin (WebCorpus, sources).
 library(SnowballC)              # Stemmer backend used by stemDocument().

 # Define the symbol we want to acquire news on.
 sym <- "NYSE:HSBC"

 # Build a corpus of the news items.
 # NOTE(review): GoogleFinanceSource depends on an external news feed;
 # confirm the feed is still available before relying on this step.
 corpus <- WebCorpus(GoogleFinanceSource(sym))

 # PROCESSING THE CORPUS ----

 # All text to lower case. tolower() is a plain character function, so it
 # must be wrapped in content_transformer() to keep the document structure.
 corpus <- tm_map(corpus, content_transformer(tolower))

 # Remove English stopwords. This runs BEFORE punctuation removal because
 # the stopword list contains contractions ("don't", "isn't", ...); once the
 # apostrophes are stripped those words no longer match and would survive.
 corpus <- tm_map(corpus, removeWords, stopwords("english"))

 # Optional step - remove particular words assumed to be analytically
 # uninteresting:
 # corpus <- tm_map(corpus, removeWords, c("hsbc", "plc", "inc"))

 # Remove punctuation (but preserve dashes inside words).
 corpus <- tm_map(corpus, removePunctuation, preserve_intra_word_dashes = TRUE)

 # Remove all numbers from the corpus.
 corpus <- tm_map(corpus, removeNumbers)

 # Strip white-space last, so the gaps left by the removals above collapse.
 corpus <- tm_map(corpus, stripWhitespace)

 # Optional step - stem the corpus:
 # corpus <- tm_map(corpus, stemDocument)

 # Create a "Document-Term Matrix" from the corpus.
 dtm <- DocumentTermMatrix(corpus)

 # Optional step - deal with sparsity (the example below uses a 66% sparsity
 # threshold, i.e. terms missing from more than about 66% of documents are
 # dropped):
 # dtm <- removeSparseTerms(dtm, 0.66)
	# Simple example of acquiring text-stream assets with the R "tm" package:
	# fetch news items for one ticker symbol, clean the text, and build a
	# document-term matrix (`dtm`) from the result.

	# library() stops immediately if a package is missing, whereas require()
	# only warns and returns FALSE, letting the script fail later with an
	# unrelated "could not find function" error.
	library(tm) # Text-mining framework.
	library(tm.plugin.webmining) # Web-mining plugin (WebCorpus, sources).
	library(SnowballC) # Stemmer backend used by stemDocument().

	# Define the symbol we want to acquire news on.
	sym <- "NYSE:HSBC"

	# Build a corpus of the news items.
	# NOTE(review): GoogleFinanceSource depends on an external news feed;
	# confirm the feed is still available before relying on this step.
	corpus <- WebCorpus(GoogleFinanceSource(sym))

	# PROCESSING THE CORPUS ----

	# All text to lower case. tolower() is a plain character function, so it
	# must be wrapped in content_transformer() to keep the document structure.
	corpus <- tm_map(corpus, content_transformer(tolower))

	# Remove English stopwords. This runs BEFORE punctuation removal because
	# the stopword list contains contractions ("don't", "isn't", ...); once the
	# apostrophes are stripped those words no longer match and would survive.
	corpus <- tm_map(corpus, removeWords, stopwords("english"))

	# Optional step - remove particular words assumed to be analytically
	# uninteresting:
	# corpus <- tm_map(corpus, removeWords, c("hsbc", "plc", "inc"))

	# Remove punctuation (but preserve dashes inside words).
	corpus <- tm_map(corpus, removePunctuation, preserve_intra_word_dashes = TRUE)

	# Remove all numbers from the corpus.
	corpus <- tm_map(corpus, removeNumbers)

	# Strip white-space last, so the gaps left by the removals above collapse.
	corpus <- tm_map(corpus, stripWhitespace)

	# Optional step - stem the corpus:
	# corpus <- tm_map(corpus, stemDocument)

	# Create a "Document-Term Matrix" from the corpus.
	dtm <- DocumentTermMatrix(corpus)

	# Optional step - deal with sparsity (the example below uses a 66% sparsity
	# threshold, i.e. terms missing from more than about 66% of documents are
	# dropped):
	# dtm <- removeSparseTerms(dtm, 0.66)