# (Assumes corpus.stemmed and corpus.prep were built in Steps 1-2,
#  which load, preprocess, and stem the corpus of documents.)
library(tm)

# how often are words (word-stems) used across all the docs?
dtm <- DocumentTermMatrix(corpus.stemmed)
# in the first 5 text files, how frequent are the first 8 words (alphabetical order)
inspect(dtm[1:5, 1:8])
# let's make that dtm table a matrix...
dtm.mat <- as.matrix(dtm)
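# Optional check of overall usage: column sums of dtm.mat give each stem's
# total count across all documents, which answers the "how often are
# word-stems used across all the docs" question directly.
head(sort(colSums(dtm.mat), decreasing = TRUE), n = 10) # 10 most frequent stems
findFreqTerms(dtm, lowfreq = 100) # stems used 100+ times (arbitrary threshold)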
####### STEP 3 ----- visualizing the high-frequency words
library(wordcloud)
wordcloud(colnames(dtm.mat), dtm.mat[12, ], max.words = 20) #essay no. 12
wordcloud(colnames(dtm.mat), dtm.mat[24, ], max.words = 20) #essay no. 24
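# Note: wordcloud() places words at random, so the layout changes between
# runs; setting a seed first makes the plots reproducible (optional).
set.seed(1234) # any fixed seed works; 1234 is an arbitrary choice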
# If we forget what a word stem refers to, we can complete it against the
# pre-stemmed corpus (corpus.prep serves as the dictionary here)
stemCompletion(c("revenu", "commerc", "peac", "army"), corpus.prep)
## Here's a way of figuring out how important a word is in a particular doc:
## tf-idf scores a word highly when it is frequent in that doc but rare
## across the rest of the corpus
dtm.tfidf <- weightTfIdf(dtm) # tf-idf weighting (an importance measure)
dtm.tfidf.mat <- as.matrix(dtm.tfidf) #convert to matrix
## 10 most important words for paper no. 12
head(sort(dtm.tfidf.mat[12, ], decreasing = TRUE), n = 10)
## 10 most important words for paper no. 24
head(sort(dtm.tfidf.mat[24, ], decreasing = TRUE), n = 10)
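# Optional sketch of what weightTfIdf() computed, assuming tm's documented
# default (normalized term frequency times a log2 inverse document frequency):
#   tf-idf(t, d) = (count of t in d / total terms in d) * log2(nDocs / docfreq(t))
# A manual check for one stem ("commerc", used above) in paper no. 12:
tf  <- dtm.mat[12, "commerc"] / sum(dtm.mat[12, ])      # normalized term freq
idf <- log2(nrow(dtm.mat) / sum(dtm.mat[, "commerc"] > 0)) # inverse doc freq
tf * idf # should match dtm.tfidf.mat[12, "commerc"]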