Skip to content

Instantly share code, notes, and snippets.

@jackwasey
Last active May 25, 2018 20:28
Show Gist options
  • Save jackwasey/59848d84728c0f55ef11 to your computer and use it in GitHub Desktop.
Save jackwasey/59848d84728c0f55ef11 to your computer and use it in GitHub Desktop.
benchmark fastmatch for string comparison in R
library(fastmatch)
library(microbenchmark)
# Lookup table for every benchmark: the first column of the downloaded word
# list, read as a plain character vector (not a factor).
# NOTE(review): this Google Code URL has been reported as broken -- confirm it
# still resolves, or point read.table() at a local copy of enable1.txt.
WORDS <- read.table(
  "https://dotnetperls-controls.googlecode.com/files/enable1.txt",
  stringsAsFactors = FALSE
)[[1]]

# The same words stored as a factor, so matching against a factor table can be
# compared with matching against the character vector.
words_factor <- as.factor(WORDS)
# Generate 100 sentences, each with its own random length of 5 to 15 words.
# The lengths are drawn up front, one per sentence: an argument such as
# `size = sample(5:15, size = 1)` forwarded through lapply()'s `...` is
# evaluated only once, which would give every sentence the same length.
# Words are drawn with replacement, so a sentence may occasionally repeat a
# word.
SENTENCES <- lapply(
  sample(5:15, size = 100, replace = TRUE),
  function(n_words) sample(WORDS, size = n_words, replace = TRUE)
)
# Apply `fun` to each element of the global SENTENCES list.
#
# fun: a function taking one sentence (a character vector of words).
# Returns a list with one result per sentence, in SENTENCES order.
bench_fun <- function(fun) {
  lapply(X = SENTENCES, FUN = fun)
}
# Poster's slow solution: flag every entry of WORDS that occurs in the
# sentence, then return the positions of the flagged entries.
#
# sentence: character vector of words.
# Returns the integer positions in WORDS whose word appears in `sentence`, in
# ascending order of position (not in sentence order); words absent from
# WORDS contribute nothing.
hg_convert <- function(sentence) {
  in_sentence <- WORDS %in% sentence
  which(in_sentence)
}
# Look up each word of a sentence in WORDS with base R's match().
#
# sentence: character vector of words.
# Returns an integer vector the same length as `sentence`, giving the position
# of each word in WORDS, or NA where the word is not found.
jw_convert_match <- function(sentence) {
  match(x = sentence, table = WORDS)
}
# Same lookup as jw_convert_match(), but against the factor version of the
# word list (words_factor) instead of the character vector.
#
# sentence: character vector of words.
# Returns an integer vector the same length as `sentence`, giving the position
# of each word in words_factor, or NA where the word is not found.
jw_convert_match_factor <- function(sentence) {
  match(x = sentence, table = words_factor)
}
# Look up each word of a sentence in WORDS with fastmatch::fmatch(), which is
# called here with the same arguments as match() in jw_convert_match().
# Per the fastmatch documentation, fmatch() caches a hash of the table after
# its first use, so repeated lookups against the same WORDS vector should be
# cheaper than with match().
#
# sentence: character vector of words.
# Returns whatever fmatch() yields for (sentence, WORDS).
jw_convert_fastmatch <- function(sentence) {
  fmatch(x = sentence, table = WORDS)
}
# Same lookup as jw_convert_fastmatch(), but against the factor version of the
# word list (words_factor) instead of the character vector.
# NOTE(review): whether fmatch() handles a factor table as efficiently as a
# character table is what the second benchmark compares -- not asserted here.
#
# sentence: character vector of words.
# Returns whatever fmatch() yields for (sentence, words_factor).
jw_convert_fastmatch_factor <- function(sentence) {
  fmatch(x = sentence, table = words_factor)
}
# Benchmark one: run each of the five lookup strategies over the current
# SENTENCES list, 10 repetitions per expression, and print the timings.
message("starting benchmark one")
print(
  microbenchmark(
    bench_fun(hg_convert),
    bench_fun(jw_convert_match),
    bench_fun(jw_convert_match_factor),
    bench_fun(jw_convert_fastmatch),
    bench_fun(jw_convert_fastmatch_factor),
    times = 10
  )
)
# Now again with a big sample: one million sentences, each with its own random
# length of 5 to 15 words. Generating them is quite slow.
# The lengths are drawn up front, one per sentence: an argument such as
# `size = sample(5:15, size = 1)` forwarded through lapply()'s `...` is
# evaluated only once, which would give every sentence the same length.
# Words are drawn with replacement, so a sentence may occasionally repeat a
# word.
SENTENCES <- lapply(
  sample(5:15, size = 1e6, replace = TRUE),
  function(n_words) sample(WORDS, size = n_words, replace = TRUE)
)
# Benchmark two: on the large SENTENCES list, compare fmatch() against the
# character vector with fmatch() against the factor, 10 repetitions each.
message("starting benchmark two, compare with factor vs vector of words")
print(
  microbenchmark(
    bench_fun(jw_convert_fastmatch),
    bench_fun(jw_convert_fastmatch_factor),
    times = 10
  )
)
@EngrStudent
Copy link

the google code link is broken.

@cormac85
Copy link

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment