library(stringr)
library(tm)
# Cache the csv as an rds file so subsequent loads are fast
if (file.exists("english.rds")) {
  data <- readRDS("english.rds")
} else {
  data <- read.csv("english_text.csv.gz", stringsAsFactors = FALSE)
  saveRDS(data, "english.rds")
}
text <- data$text
# Encoding -------------------------------------------------------------------
# Figure out which encoding is used in the original data
# Count how many strings fail to convert from enc to UTF-8
n_failed <- function(x, enc) {
  reenc <- iconv(x, enc, "UTF-8")
  sum(is.na(reenc))
}
failures <- sapply(iconvlist(), n_failed, x = text[1:100])
ok <- iconvlist()[failures == 0]
conv <- vapply(ok, iconv, x = text[9], to = "UTF-8", character(1))
str_sub(conv, 200, 240)
conv <- vapply(ok, iconv, x = text[15], to = "UTF-8", character(1))
str_sub(conv, -300, -200)
# Doesn't seem to be consistently encoded
# Just ram it all into ASCII
system.time(text2 <- iconv(text, "UTF-8", "ASCII//TRANSLIT", "??"))
# Strip whitespace, punctuation and stopwords --------------------------------
# Which is fastest?
strip_space <- function(x, ...) {
  gsub("([[:punct:]]|[[:space:]])+", " ", x, ...)
}
system.time(text3 <- strip_space(text2[1:1000]))
system.time(text3 <- strip_space(text2[1:1000], perl = TRUE))
system.time(text3 <- strip_space(text2[1:1000], useBytes = TRUE))
system.time(text3 <- strip_space(text2[1:1000], perl = TRUE, useBytes = TRUE))
# Perl is fastest
system.time({
  text3a <- gsub("[[:punct:]]+", "", text2, perl = TRUE)
  text3 <- gsub("[[:space:]]+", " ", text3a, perl = TRUE)
})
# Takes about 9 seconds all up
system.time(text4 <- tolower(text3))
# Takes another 2 seconds
# Remove stopwords
match <- sprintf("\\b(%s)\\b", paste(stopwords("english"), collapse = "|"))
# Explore speed again - I don't know why useBytes matters so much here.
# It suggests that R doesn't realise this is now plain ASCII text,
# but Encoding() reports the same results either way (quick look below).
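# A quick look (sketch, not in the original gist): Encoding() likely reports
# "unknown" for most strings both before and after the ASCII transliteration,
# so it gives no hint about which path the regex engine takes
table(Encoding(text[1:100]))
table(Encoding(text4[1:100]))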
system.time(gsub(match, "", text4[1:100]))
system.time(gsub(match, "", text4[1:100], perl = TRUE))
system.time(gsub(match, "", text4[1:100], useBytes = TRUE))
system.time(gsub(match, "", text4[1:100], perl = TRUE, useBytes = TRUE))
# Fastest, but doesn't seem to grow linearly (rough scaling check after the
# timings below)
system.time(gsub(match, "", text4[1:1000], perl = TRUE, useBytes = TRUE))
system.time(gsub(match, "", text4[1:2000], perl = TRUE, useBytes = TRUE))
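# Rough scaling check (sketch, not in the original gist): time a few input
# sizes to see how the stopword regex grows
sizes <- c(100, 500, 1000, 2000)
sapply(sizes, function(n) {
  system.time(gsub(match, "", text4[1:n], perl = TRUE, useBytes = TRUE))[["elapsed"]]
})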
replace_all <- function(x, matches) {
  for (match in matches) {
    x <- gsub(match, "", x, fixed = TRUE)
  }
  x
}
# A little faster - but sacrifices word boundaries (illustration after the
# timings below)
system.time(replace_all(text4[1:100], stopwords("english")))
system.time(replace_all(text4[1:1000], stopwords("english")))
system.time(replace_all(text4[1:2000], stopwords("english")))
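# Illustration (my example, not from the original gist) of the word-boundary
# problem: fixed matching also deletes stopwords embedded in longer words
replace_all("the theme of this thesis", c("the", "of"))
#> roughly " me  this sis" - "the" is stripped out of "theme" and "thesis" too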
# Split on spaces, drop any word that appears in matches, then paste back
split_words <- function(x, matches) {
  xs <- strsplit(x, " ", fixed = TRUE, useBytes = TRUE)
  removed <- lapply(xs, function(x) x[match(x, matches, 0L) == 0L])
  lapply(removed, paste, collapse = " ")
}
# Much, much faster - but have I missed something? (spot check after the
# timings below)
system.time(split_words(text4[1:100], stopwords("english")))
system.time(split_words(text4[1:1000], stopwords("english")))
system.time(split_words(text4[1:2000], stopwords("english")))
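# Spot check (sketch, not in the original gist): eyeball one document before
# and after to confirm only whole stopwords were dropped
str_sub(text4[1], 1, 200)
str_sub(split_words(text4[1], stopwords("english"))[[1]], 1, 200)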
# Can we make it even better with the fastmatch package?
library(fastmatch)
split_words2 <- function(x, matches) {
  xs <- strsplit(x, " ", fixed = TRUE, useBytes = TRUE)
  removed <- lapply(xs, function(x) x[fmatch(x, matches, 0L) == 0L])
  lapply(removed, paste, collapse = " ")
}
system.time(split_words2(text4[1:100], stopwords("english")))
system.time(text5 <- split_words2(text4, stopwords("english")))
# Yup, almost twice as fast