Last active
February 15, 2018 04:50
-
-
Save SymbolixAU/b3aa62b4cab9b18b0b476a729a3fbaa7 to your computer and use it in GitHub Desktop.
debugging the `tm::stemCompletion` function
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Identifying the differences between using a Corpus dictionary and a vector of strings.
## ---------
## Using a Corpus dictionary
## and type == "prevalent"
## Define the example words up front so this section runs on its own.
w <- c("intersection", "intersecting", "intersection", "intersects")
## All tm functions are namespace-qualified, matching `tm::stemCompletion`,
## so the call works without attaching the package.
tm::stemCompletion(
  x = tm::stemDocument(w),
  dictionary = tm::Corpus(x = tm::VectorSource(x = w)),
  type = "prevalent"
)
#      intersect      intersect      intersect      intersect
# "intersecting" "intersecting" "intersecting" "intersecting"
## ---------
## Using a vector dictionary
## and type == "prevalent"
w <- c("intersection", "intersecting", "intersection", "intersects")
## The dictionary is the raw character vector, duplicates included.
## `stemDocument` is namespace-qualified, matching `tm::stemCompletion`,
## so the call works without attaching the package.
tm::stemCompletion(
  x = tm::stemDocument(w),
  dictionary = w,
  type = "prevalent"
)
#      intersect      intersect      intersect      intersect
# "intersection" "intersection" "intersection" "intersection"
## ----------
## Extracting the specific code from tm::stemCompletion to debug
## ---------
## Using a Corpus dictionary
## and type == "prevalent"
w <- c("intersection", "intersecting", "intersection", "intersects")
dictionary <- tm::Corpus(x = tm::VectorSource(x = w))
x <- tm::stemDocument(x = w, language = "english")
## the following lines are taken from the `tm::stemCompletion` function
## (`words` is written as `NLP::words` so the line runs without attaching tm)
## Flatten the Corpus into a character vector of its words. unique() drops the
## repeated "intersection", so each candidate is counted once further down.
dictionary <- unique(unlist(lapply(dictionary, NLP::words)))
## For every stem, keep the dictionary entries that start with that stem.
possibleCompletions <- lapply(x, function(w) grep(sprintf("^%s", w), dictionary, value = TRUE))
## Tabulate the candidates per stem, highest count first.
possibleCompletions <- lapply(possibleCompletions, function(x) {
  sort(table(x), decreasing = TRUE)
})
## Print the per-stem frequency tables for inspection.
possibleCompletions
## The name of the first entry of each table is the completion that gets picked.
n <- names(sapply(possibleCompletions, "[", 1))
## Label the completions with their stems; fall back to NA when nothing matched.
setNames(if (length(n)) n else rep_len(NA, length(x)), x)
#      intersect      intersect      intersect      intersect
# "intersecting" "intersecting" "intersecting" "intersecting"
## ---------
## Using a vector dictionary
## and type == "prevalent"
w <- c("intersection", "intersecting", "intersection", "intersects")
## The dictionary is used as-is: no de-duplication step is applied in this
## path, so "intersection" stays in the dictionary twice.
dictionary <- w
x <- tm::stemDocument(x = w, language = "english")
## the following lines are taken from the `tm::stemCompletion` function
## For every stem, keep the dictionary entries that start with that stem.
possibleCompletions <- lapply(x, function(w) grep(sprintf("^%s", w), dictionary, value = TRUE))
## Tabulate the candidates per stem, highest count first; the duplicated
## "intersection" therefore sorts to the front.
possibleCompletions <- lapply(possibleCompletions, function(x) {
  sort(table(x), decreasing = TRUE)
})
## Print the per-stem frequency tables for inspection.
possibleCompletions
## The name of the first entry of each table is the completion that gets picked.
n <- names(sapply(possibleCompletions, "[", 1))
## Label the completions with their stems; fall back to NA when nothing matched.
setNames(if (length(n)) n else rep_len(NA, length(x)), x)
#      intersect      intersect      intersect      intersect
# "intersection" "intersection" "intersection" "intersection"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment