# hadley/3085676 - cleaning English text and stripping stopwords, fast
library(stringr)
library(tm)

# Cache the parsed data as an rds file so repeat runs skip the slow csv parse
if (file.exists("english.rds")) {
  data <- readRDS("english.rds")
} else {
  data <- read.csv("english_text.csv.gz", stringsAsFactors = FALSE)
  saveRDS(data, "english.rds")
}
text <- data$text
# Encoding -------------------------------------------------------------------
# Figure out which encoding is used in the original data
n_failed <- function(x, enc) {
  reenc <- iconv(x, enc, "UTF-8")
  sum(is.na(reenc))
}
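# For example (assuming a stray latin1 byte): iconv() returns NA when a
# string can't be reinterpreted in the given encoding, so
# n_failed("fa\xe7ade", "UTF-8")  #> 1, since the byte \xe7 is not valid UTF-8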
# Try every encoding R knows about on a 100-document sample
failures <- sapply(iconvlist(), n_failed, x = text[1:100])
ok <- iconvlist()[failures == 0]
# Spot-check a couple of documents under each surviving candidate encoding
conv <- vapply(ok, iconv, x = text[9], to = "UTF-8", character(1))
str_sub(conv, 200, 240)
conv <- vapply(ok, iconv, x = text[15], to = "UTF-8", character(1))
str_sub(conv, -300, -200)
# Doesn't seem to be consistently encoded, so just ram everything into ASCII,
# transliterating where possible and substituting "??" for the rest
system.time(text2 <- iconv(text, "UTF-8", "ASCII//TRANSLIT", "??"))
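# //TRANSLIT asks iconv to approximate characters that have no ASCII
# equivalent rather than dropping them; the exact output is
# platform-dependent (glibc and macOS iconv transliterate differently), e.g.
# iconv("café", "UTF-8", "ASCII//TRANSLIT")  #> "cafe" on most platforms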
# Strip whitespace, punctuation and stopwords --------------------------------
# Which combination of perl/useBytes flags is fastest?
strip_space <- function(x, ...) {
  gsub("([[:punct:]]|[[:space:]])+", " ", x, ...)
}
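# e.g. strip_space("Hello, world -- again!")  #> "Hello world again "
# (note the trailing space left by the final "!")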
system.time(text3 <- strip_space(text2[1:1000]))
system.time(text3 <- strip_space(text2[1:1000], perl = TRUE))
system.time(text3 <- strip_space(text2[1:1000], useBytes = TRUE))
system.time(text3 <- strip_space(text2[1:1000], perl = TRUE, useBytes = TRUE))
# Perl is fastest
system.time({
  text3a <- gsub("[[:punct:]]+", "", text2, perl = TRUE)
  text3 <- gsub("[[:space:]]+", " ", text3a, perl = TRUE)
})
# Takes about 9 seconds all up
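# Note this two-pass version deletes punctuation instead of replacing it
# with a space, so contractions collapse differently than under strip_space():
# gsub("[[:punct:]]+", "", "don't", perl = TRUE)  #> "dont", not "don t"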
system.time(text4 <- tolower(text3))
# Takes another 2 seconds
# Remove stopwords: build a single alternation anchored on word boundaries
stopword_re <- sprintf("\\b(%s)\\b", paste(stopwords("english"), collapse = "|"))
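# The pattern looks like "\\b(i|me|my|myself|we|our|...)\\b"; removing
# matches leaves the surrounding spaces behind, roughly:
# gsub(stopword_re, "", "this is the best of times")  #> "   best  times"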
# Explore speed again - I don't know why using bytes matters so much here.
# It suggests that R doesn't realise that this is now plain ASCII text,
# even though Encoding() reports the same thing either way
system.time(gsub(stopword_re, "", text4[1:100]))
system.time(gsub(stopword_re, "", text4[1:100], perl = TRUE))
system.time(gsub(stopword_re, "", text4[1:100], useBytes = TRUE))
system.time(gsub(stopword_re, "", text4[1:100], perl = TRUE, useBytes = TRUE))
# Fastest, but time doesn't seem to grow linearly with input size
system.time(gsub(stopword_re, "", text4[1:1000], perl = TRUE, useBytes = TRUE))
system.time(gsub(stopword_re, "", text4[1:2000], perl = TRUE, useBytes = TRUE))
# Alternative: one fixed-string gsub() per stopword
replace_all <- function(x, matches) {
  for (match in matches) {
    x <- gsub(match, "", x, fixed = TRUE)
  }
  x
}
# A little faster - but fixed matching sacrifices word boundaries
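# e.g. removing "the" and "is" also eats into other words:
# replace_all("there is a theme", c("the", "is"))  #> "re  a me"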
system.time(replace_all(text4[1:100], stopwords("english")))
system.time(replace_all(text4[1:1000], stopwords("english")))
system.time(replace_all(text4[1:2000], stopwords("english")))
# Alternative: split into words, drop stopwords by table lookup, rejoin
split_words <- function(x, matches) {
  xs <- strsplit(x, " ", fixed = TRUE, useBytes = TRUE)
  removed <- lapply(xs, function(x) x[match(x, matches, 0L) == 0L])
  lapply(removed, paste, collapse = " ")
}
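# match(x, table, nomatch = 0L) gives a positive index for stopwords and 0
# otherwise, so the == 0L test keeps exactly the non-stopwords. Note the
# result is a list of documents; unlist() it if you need a character vector:
# split_words("the cat sat", stopwords("english"))  #> list("cat sat")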
# Much much faster - but have I missed something?
system.time(split_words(text4[1:100], stopwords("english")))
system.time(split_words(text4[1:1000], stopwords("english")))
system.time(split_words(text4[1:2000], stopwords("english")))
# Can we make it even better with the fastmatch package?
library(fastmatch)
split_words2 <- function(x, matches) {
  xs <- strsplit(x, " ", fixed = TRUE, useBytes = TRUE)
  removed <- lapply(xs, function(x) x[fmatch(x, matches, 0L) == 0L])
  lapply(removed, paste, collapse = " ")
}
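# fmatch() is a drop-in replacement for match(): the first call builds a
# hash table on `matches` and caches it as an attribute of that vector, so
# every later lookup against the same stopword table is much cheaper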
system.time(split_words2(text4[1:100], stopwords("english")))
system.time(text5 <- split_words2(text4, stopwords("english")))
# Yup, almost twice as fast