Created
January 30, 2023 19:59
-
-
Save diamonaj/c31ccb85922d3789406913771962564d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build the document-term matrix: how often each word (word stem) is used in
# each document of the stemmed corpus (rows = documents, columns = terms).
# NOTE(review): DocumentTermMatrix() and inspect() look like tm functions;
# assumes library(tm) was loaded earlier in the script -- confirm.
dtm <- DocumentTermMatrix(corpus.stemmed)

# In the first 5 text files, how frequent are the first 8 words
# (alphabetical order)?
inspect(dtm[1:5, 1:8])

# Turn the dtm into an ordinary matrix so a single essay can be pulled out
# by row number.
dtm.mat <- as.matrix(dtm)
####### STEP 3 ----- visualizing the high-frequency words
library(wordcloud)

# One cloud per essay: the column names of dtm.mat are the words and the
# essay's row supplies their frequencies; at most 20 words are drawn.
wordcloud(
  words = colnames(dtm.mat),
  freq = dtm.mat[12, ],
  max.words = 20
) # essay no. 12
wordcloud(
  words = colnames(dtm.mat),
  freq = dtm.mat[24, ],
  max.words = 20
) # essay no. 24
# If we forget what a word stem refers to, we can look up a completion for it,
# using corpus.prep as the dictionary of candidate words.
# NOTE(review): "army" is already a whole word rather than a truncated stem;
# if the corpus was stemmed with a Porter-style stemmer the stem may be
# "armi" -- confirm this entry is intentional.
stemCompletion(c("revenu", "commerc", "peac", "army"), corpus.prep)
## Here's a way of figuring out how important a word is in a particular doc:
## re-weight the document-term matrix by tf-idf (an importance measure).
dtm.tfidf <- weightTfIdf(dtm)
dtm.tfidf.mat <- as.matrix(dtm.tfidf) # convert to a plain matrix

## 10 most important words for paper no. 12 (largest tf-idf weight first)
head(sort(dtm.tfidf.mat[12, ], decreasing = TRUE), n = 10)

## 10 most important words for paper no. 24 (largest tf-idf weight first)
head(sort(dtm.tfidf.mat[24, ], decreasing = TRUE), n = 10)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment