SymbolixAU · February 15, 2018 04:50
diff --git a/stemCompletion.R b/stemCompletion.R
 ## Identifying the differences between using a Corpus dictionary, and a
 ## vector of strings, when completing stems with tm::stemCompletion().

 ## Words to stem and complete; also the source of both dictionaries.
 ## Assigned before its first use so the script runs top to bottom in a
 ## fresh session.
 w <- c("intersection", "intersecting", "intersection", "intersects")

 ## ---------
 ## Using a Corpus dictionary
 ## and type == "prevalent"

 ## tm functions are namespace-qualified so nothing has to be attached.
 tm::stemCompletion(
   x = tm::stemDocument(w),
   dictionary = tm::Corpus(x = tm::VectorSource(x = w)),
   type = "prevalent"
 )
 # intersect      intersect      intersect      intersect
 # "intersecting" "intersecting" "intersecting" "intersecting"

 ## ---------
 ## Using a vector dictionary
 ## and type == "prevalent"

 tm::stemCompletion(
   x = tm::stemDocument(w),
   dictionary = w,
   type = "prevalent"
 )
 # intersect      intersect      intersect      intersect
 # "intersection" "intersection" "intersection" "intersection"


 ## ----------
 ## Extracting the specific code from tm::stemCompletion to debug


 ## ---------
 ## Using a Corpus dictionary
 ## and type == "prevalent"

 ## Package functions are namespace-qualified so this section runs without
 ## attaching tm.
 w <- c("intersection", "intersecting", "intersection", "intersects")
 dictionary <- tm::Corpus(x = tm::VectorSource(x = w))
 x <- tm::stemDocument(x = w, language = "english")

 ## the following lines are taken from the `tm::stemCompletion` function
 ## (only `words` has been namespace-qualified)

 ## The Corpus is flattened to its *unique* words, so the fact that
 ## "intersection" appears twice in `w` is discarded here.
 dictionary <- unique(unlist(lapply(dictionary, NLP::words)))

 ## For every stem, keep the dictionary words that begin with it.
 possibleCompletions <- lapply(
   x,
   function(w) grep(sprintf("^%s", w), dictionary, value = TRUE)
 )

 ## Count the candidates per stem and order them by decreasing frequency.
 ## Because of unique() above, every candidate has a count of 1 here.
 possibleCompletions <- lapply(possibleCompletions, function(x) {
   sort(table(x), decreasing = TRUE)
 })
 possibleCompletions

 ## The name of the first table entry for each stem is the completion.
 ## With all counts tied, that entry is "intersecting" (see output below)
 ## rather than the word that is most frequent in `w`.
 n <- names(sapply(possibleCompletions, "[", 1))
 setNames(if (length(n)) n else rep_len(NA, length(x)), x)
 # intersect      intersect      intersect      intersect
 # "intersecting" "intersecting" "intersecting" "intersecting"



 ## ---------
 ## Using a vector dictionary
 ## and type == "prevalent"

 ## stemDocument is namespace-qualified so this section runs without
 ## attaching tm.
 w <- c("intersection", "intersecting", "intersection", "intersects")
 dictionary <- w
 x <- tm::stemDocument(x = w, language = "english")

 ## the following lines are taken from the `tm::stemCompletion` function

 ## For every stem, keep the dictionary words that begin with it.
 ## The vector dictionary is used as-is, so the duplicated "intersection"
 ## is still present among the matches.
 possibleCompletions <- lapply(
   x,
   function(w) grep(sprintf("^%s", w), dictionary, value = TRUE)
 )

 ## Count the candidates per stem and order them by decreasing frequency;
 ## "intersection" has the highest count and therefore sorts first.
 possibleCompletions <- lapply(possibleCompletions, function(x) {
   sort(table(x), decreasing = TRUE)
 })
 possibleCompletions

 ## The name of the first table entry for each stem is the completion.
 n <- names(sapply(possibleCompletions, "[", 1))
 setNames(if (length(n)) n else rep_len(NA, length(x)), x)
 # intersect      intersect      intersect      intersect
 # "intersection" "intersection" "intersection" "intersection"
	## Identifying the differences between using a Corpus dictionary, and a
	## vector of strings, when completing stems with tm::stemCompletion().

	## Words to stem and complete; also the source of both dictionaries.
	## Assigned before its first use so this section does not rely on a `w`
	## left behind by earlier code.
	w <- c("intersection", "intersecting", "intersection", "intersects")

	## ---------
	## Using a Corpus dictionary
	## and type == "prevalent"

	## tm functions are namespace-qualified so nothing has to be attached.
	tm::stemCompletion(
	  x = tm::stemDocument(w),
	  dictionary = tm::Corpus(x = tm::VectorSource(x = w)),
	  type = "prevalent"
	)
	# intersect      intersect      intersect      intersect
	# "intersecting" "intersecting" "intersecting" "intersecting"

	## ---------
	## Using a vector dictionary
	## and type == "prevalent"

	tm::stemCompletion(
	  x = tm::stemDocument(w),
	  dictionary = w,
	  type = "prevalent"
	)
	# intersect      intersect      intersect      intersect
	# "intersection" "intersection" "intersection" "intersection"


	## ----------
	## Extracting the specific code from tm::stemCompletion to debug


	## ---------
	## Using a Corpus dictionary
	## and type == "prevalent"

	## Package functions are namespace-qualified so this section runs without
	## attaching tm.
	w <- c("intersection", "intersecting", "intersection", "intersects")
	dictionary <- tm::Corpus(x = tm::VectorSource(x = w))
	x <- tm::stemDocument(x = w, language = "english")

	## the following lines are taken from the `tm::stemCompletion` function
	## (only `words` has been namespace-qualified)

	## The Corpus is flattened to its *unique* words, so the fact that
	## "intersection" appears twice in `w` is discarded here.
	dictionary <- unique(unlist(lapply(dictionary, NLP::words)))

	## For every stem, keep the dictionary words that begin with it.
	possibleCompletions <- lapply(
	  x,
	  function(w) grep(sprintf("^%s", w), dictionary, value = TRUE)
	)

	## Count the candidates per stem and order them by decreasing frequency.
	## Because of unique() above, every candidate has a count of 1 here.
	possibleCompletions <- lapply(possibleCompletions, function(x) {
	  sort(table(x), decreasing = TRUE)
	})
	possibleCompletions

	## The name of the first table entry for each stem is the completion.
	## With all counts tied, that entry is "intersecting" (see output below)
	## rather than the word that is most frequent in `w`.
	n <- names(sapply(possibleCompletions, "[", 1))
	setNames(if (length(n)) n else rep_len(NA, length(x)), x)
	# intersect      intersect      intersect      intersect
	# "intersecting" "intersecting" "intersecting" "intersecting"



	## ---------
	## Using a vector dictionary
	## and type == "prevalent"

	## stemDocument is namespace-qualified so this section runs without
	## attaching tm.
	w <- c("intersection", "intersecting", "intersection", "intersects")
	dictionary <- w
	x <- tm::stemDocument(x = w, language = "english")

	## the following lines are taken from the `tm::stemCompletion` function

	## For every stem, keep the dictionary words that begin with it.
	## The vector dictionary is used as-is, so the duplicated "intersection"
	## is still present among the matches.
	possibleCompletions <- lapply(
	  x,
	  function(w) grep(sprintf("^%s", w), dictionary, value = TRUE)
	)

	## Count the candidates per stem and order them by decreasing frequency;
	## "intersection" has the highest count and therefore sorts first.
	possibleCompletions <- lapply(possibleCompletions, function(x) {
	  sort(table(x), decreasing = TRUE)
	})
	possibleCompletions

	## The name of the first table entry for each stem is the completion.
	n <- names(sapply(possibleCompletions, "[", 1))
	setNames(if (length(n)) n else rep_len(NA, length(x)), x)
	# intersect      intersect      intersect      intersect
	# "intersection" "intersection" "intersection" "intersection"