Skip to content

Instantly share code, notes, and snippets.

@benmarwick
Created June 2, 2013 08:22
Show Gist options
  • Save benmarwick/5693017 to your computer and use it in GitHub Desktop.
Save benmarwick/5693017 to your computer and use it in GitHub Desktop.
Storing a DocumentTermMatrix on disk (i.e., out of memory) using the filehash and ff packages in R
# Test of storing a DocumentTermMatrix (DTM) on disk with filehash.

# Reproducible data: the `crude` corpus that ships with tm.
library(tm)
data(crude)
dtm <- DocumentTermMatrix(crude)

# Create a filehash database named "testDB" and open a handle to it.
library(filehash)
dbCreate("testDB")
db <- dbInit("testDB")

# Document identifiers, looked up once instead of on every iteration.
doc_ids <- dtm$dimnames$Docs

# Approach 1: store each doc as a vector on disk, in a list of vectors.
# The list is built in memory and written under key "A" in one assignment.
# Assigning element by element with a character name into a preallocated
# unnamed list would append new elements rather than fill the existing
# slots, leaving a list of twice the intended length padded with NULLs.
doc_vectors <- lapply(doc_ids, function(doc_id) as.vector(dtm[doc_id, ]))
# Keep the "1", "2", ... names so elements can be fetched by position or name.
names(doc_vectors) <- as.character(seq_along(doc_ids))
db$A <- doc_vectors

# Approach 2: store each doc as a vector on disk, each doc is a row in a
# matrix. This overwrites the list stored under key "A" above.
library(plyr)
db$A <- laply(
  seq_along(doc_ids),
  function(i) as.vector(dtm[doc_ids[[i]], ]),
  .progress = "text"
)

# Sanity check: row 12 read back from the database matches the DTM row.
identical(unname(db$A[12, ]), as.vector(dtm[doc_ids[[12]], ]))

# Compare the in-memory size of the database handle with that of the DTM.
object.size(db)
object.size(dtm)

# List the keys in the database, then remove the stored object.
dbList(db)
dbDelete(db, "A")
# How about with ff?
# Store each doc as a row in an ff matrix (docs x terms), initialised to 0.
library(ff)

# Document identifiers and matrix dimensions, looked up once.
doc_ids <- dtm$dimnames$Docs
n_docs <- length(doc_ids)
n_terms <- length(dtm$dimnames$Terms)

fdtm <- ff(0, dim = c(n_docs, n_terms))

# Variant 1: plain loop, printing the row index as a progress indicator.
# seq_len() yields an empty sequence when there are no documents, whereas
# 1:length(...) would iterate over c(1, 0).
for (i in seq_len(n_docs)) {
  print(i)
  fdtm[i, ] <- as.vector(dtm[doc_ids[[i]], ])
}

# Variant 2: the same fill via plyr::l_ply with a text progress bar.
# NOTE: the assignment happens inside a closure; this relies on ff's `[<-`
# writing through to the file-backed object. With an ordinary in-memory
# matrix it would only modify a local copy and the result would be lost.
library(plyr)
l_ply(
  seq_len(n_docs),
  function(i) fdtm[i, ] <- as.vector(dtm[doc_ids[[i]], ]),
  .progress = "text"
)

# Location of ff file
getOption("fftempdir")

# Sanity check: row 12 read back from the ff matrix matches the DTM row.
identical(fdtm[12, ], as.vector(dtm[doc_ids[[12]], ]))

# Compare the in-memory size of the ff handle with that of the DTM.
object.size(fdtm)
object.size(dtm)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment