library(stringr)
library(tm)
# Cache the csv as an rds file so subsequent loads are fast
if (file.exists("english.rds")) {
  data <- readRDS("english.rds")
} else {
  data <- read.csv("english_text.csv.gz", stringsAsFactors = FALSE)
  saveRDS(data, "english.rds")
}
text <- data$text
# Encoding -------------------------------------------------------------------
# Figure out which encoding is used in the original data
# Count how many strings fail to convert from enc to UTF-8
n_failed <- function(x, enc) {
  reenc <- iconv(x, enc, "UTF-8")
  sum(is.na(reenc))
}
failures <- sapply(iconvlist(), n_failed, x = text[1:100])
ok <- iconvlist()[failures == 0]
conv <- vapply(ok, iconv, x = text[9], to = "UTF-8", character(1))
str_sub(conv, 200, 240)
conv <- vapply(ok, iconv, x = text[15], to = "UTF-8", character(1))
str_sub(conv, -300, -200)
# Doesn't seem to be consistently encoded
# Just ram it all into ASCII
system.time(text2 <- iconv(text, "UTF-8", "ASCII//TRANSLIT", "??"))
# Strip whitespace, punctuation and stopwords --------------------------------
# Which is fastest?
strip_space <- function(x, ...) {
  gsub("([[:punct:]]|[[:space:]])+", " ", x, ...)
}
system.time(text3 <- strip_space(text2[1:1000]))
system.time(text3 <- strip_space(text2[1:1000], perl = TRUE))
system.time(text3 <- strip_space(text2[1:1000], useBytes = TRUE))
system.time(text3 <- strip_space(text2[1:1000], perl = TRUE, useBytes = TRUE))
# Perl is fastest
system.time({
  text3a <- gsub("[[:punct:]]+", "", text2, perl = TRUE)
  text3 <- gsub("[[:space:]]+", " ", text3a, perl = TRUE)
})
# Takes about 9 seconds all up
system.time(text4 <- tolower(text3))
# Takes another 2 seconds
# Remove stopwords
match <- sprintf("\\b(%s)\\b", paste(stopwords("english"), collapse = "|"))
# Explore speed again - I don't know why useBytes matters so much here.
# It suggests that R doesn't realise this is now plain ASCII text,
# but Encoding() reports the same results either way (quick look below).
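# A quick look (sketch, not in the original gist): Encoding() likely reports
# "unknown" for most strings both before and after the ASCII transliteration,
# so it gives no hint about which path the regex engine takes
table(Encoding(text[1:100]))
table(Encoding(text4[1:100]))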
system.time(gsub(match, "", text4[1:100]))
system.time(gsub(match, "", text4[1:100], perl = TRUE))
system.time(gsub(match, "", text4[1:100], useBytes = TRUE))
system.time(gsub(match, "", text4[1:100], perl = TRUE, useBytes = TRUE))
# Fastest, but doesn't seem to grow linearly (rough scaling check after the
# timings below)
system.time(gsub(match, "", text4[1:1000], perl = TRUE, useBytes = TRUE))
system.time(gsub(match, "", text4[1:2000], perl = TRUE, useBytes = TRUE))
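# Rough scaling check (sketch, not in the original gist): time a few input
# sizes to see how the stopword regex grows
sizes <- c(100, 500, 1000, 2000)
sapply(sizes, function(n) {
  system.time(gsub(match, "", text4[1:n], perl = TRUE, useBytes = TRUE))[["elapsed"]]
})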
replace_all <- function(x, matches) {
  for (match in matches) {
    x <- gsub(match, "", x, fixed = TRUE)
  }
  x
}
# A little faster - but sacrifices word boundaries (illustration after the
# timings below)
system.time(replace_all(text4[1:100], stopwords("english")))
system.time(replace_all(text4[1:1000], stopwords("english")))
system.time(replace_all(text4[1:2000], stopwords("english")))
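# Illustration (my example, not from the original gist) of the word-boundary
# problem: fixed matching also deletes stopwords embedded in longer words
replace_all("the theme of this thesis", c("the", "of"))
#> roughly " me  this sis" - "the" is stripped out of "theme" and "thesis" too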
# Split on spaces, drop any word that appears in matches, then paste back
split_words <- function(x, matches) {
  xs <- strsplit(x, " ", fixed = TRUE, useBytes = TRUE)
  removed <- lapply(xs, function(x) x[match(x, matches, 0L) == 0L])
  lapply(removed, paste, collapse = " ")
}
# Much, much faster - but have I missed something? (spot check after the
# timings below)
system.time(split_words(text4[1:100], stopwords("english")))
system.time(split_words(text4[1:1000], stopwords("english")))
system.time(split_words(text4[1:2000], stopwords("english")))
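# Spot check (sketch, not in the original gist): eyeball one document before
# and after to confirm only whole stopwords were dropped
str_sub(text4[1], 1, 200)
str_sub(split_words(text4[1], stopwords("english"))[[1]], 1, 200)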
# Can we make it even better with the fastmatch package?
library(fastmatch)
split_words2 <- function(x, matches) {
  xs <- strsplit(x, " ", fixed = TRUE, useBytes = TRUE)
  removed <- lapply(xs, function(x) x[fmatch(x, matches, 0L) == 0L])
  lapply(removed, paste, collapse = " ")
}
system.time(split_words2(text4[1:100], stopwords("english")))
system.time(text5 <- split_words2(text4, stopwords("english")))
# Yup, almost twice as fast