Skip to content

Instantly share code, notes, and snippets.

@Gerjo
Created September 19, 2014 15:20
Show Gist options
  • Save Gerjo/efe70ce9bd0da9596d8e to your computer and use it in GitHub Desktop.
Save Gerjo/efe70ce9bd0da9596d8e to your computer and use it in GitHub Desktop.
Latent semantic analysis and singular value decomposition to automatically determine a comment's relevancy to the topic it was posted to. Conclusion: it doesn't work yet, more research required.
cat("\014")
library(base)
if(require(tm) == FALSE) {
print("Attempting to install TM support...");
install.packages("tm")
library(tm);
}
if(require(RSQLite) == FALSE) {
print("Attempting to install SQLite support...");
install.packages("RSQLite")
library(RSQLite);
}
# Run a SQLite query and capture the result as a data frame.
runQuery <- function(sql) {
tries <- 10
# Give it a few tries. SQLite likes to give concurrency errors, e.g., when
# browsing the data during an insert (table level locking, not row or column).
while(tries > 0) {
tryCatch(
{
res <- dbGetQuery(db, sql)
tries <- 0
},
error = function(e) {
cat(paste("Query failed, attempts left:", tries, "\n"))
Sys.sleep(1)
}
)
tries <- tries - 1
}
return(res)
}
# SVD with diagonal trimming, i.e., reduction of dimensions.
reduce <- function(mat, trimCount) {
# Appl singular value decomposition
res <- svd(mat)
# Retrieve diagonal matrix as an actual diagonal.
s <- diag(res$d)
#print(round(s, digits = 3))
# Set last n diagonal values to zero
for(i in 1 : nrow(s)) {
if(i > nrow(s) - trimCount) {
s[i,i] <- 0;
cat("null")
}
}
reconstructed <- res$u %*% s %*% t(res$v)
# Hack to overcome floating point issues. Numbers randomly change sign?
# R relies on underlying libraries, there is no unified way of dealing
# with irrational numbers, it would seem. Random stability!
reconstructed <- round(reconstructed, digits = 10)
# Reconstruct, with reduced dimensions
return (reconstructed)
}
# Nicely hardcoded paths.
workdir <- "~/chamber/mmr/"
if( ! file.exists(workdir)) {
warning(paste("Working directory", workdir, "does not exist. The database must be placed in this path."))
}
setwd(workdir)
# Connect to the database
sqlite <- dbDriver("SQLite")
db <- dbConnect(sqlite, "./documents.sqlite")
documents <- runQuery("SELECT tweakersid, title, document FROM documents");
for(i in 1 : nrow(documents)) {
document <- documents[i, ]
comments <- runQuery(paste("SELECT score, document FROM comments WHERE moderated <> '0' AND tweakersid = ", document$tweakersid, sep = ""));
# Merge document and comments into one space
documentSpace <- c(document$document, comments["document"], recursive = TRUE)
corpus <- VCorpus(VectorSource(documentSpace))
# Filters and such
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, removeWords, stopwords("dutch"))
# Count the words
dtm <- DocumentTermMatrix(corpus)
# Reduce data to two dimensions
dtm = reduce(dtm, nrow(dtm) - 2)
#dtm = reduce(dtm, 50)
# Spearman's Rho correlation matrix (Filled with empty strings)
spearman <- matrix("", nrow(dtm), 2) # m x 1
# Compute compute compute!
for(i in 1 : nrow(spearman)) {
a <- as.vector(dtm[1, ])
b <- as.vector(dtm[i, ])
if(max(a) == 0 && min(a) == 0) {
cat("Warning: null a vector.\n")
next;
}
# Dimension reduction yields null vectors?
if(max(b) == 0 && min(b) == 0) {
cat("Warning: null b vector.\n")
next;
}
# Compute spearman's Rho correlation factor
c <- cor(a, b, method = "spearman")
# ... or use the default correlation coefficient
#c <- cor(a, b)
spearman[i, 1] <- round(c, digits = 2)
if(i > 1) {
# lookup the score according to tweakers.net
spearman[i, 2] <- comments[i - 1, "score"]
}
}
# Pretty column names.
colnames(spearman) <- c("Spearman's rank correlation coefficient", "Tweakers mod-Score")
# Pretty plot.
plot(spearman, xlim=c(-1, 1), ylim=c(-1, 2))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment