Skip to content

Instantly share code, notes, and snippets.

@yosuke-yasuda
Last active June 3, 2016 20:08
Show Gist options
  • Save yosuke-yasuda/9aeab489ea4f204e612aeaf345ab24cf to your computer and use it in GitHub Desktop.
Save yosuke-yasuda/9aeab489ea4f204e612aeaf345ab24cf to your computer and use it in GitHub Desktop.
#' Get idf for terms
calc_idf <- function(document, term, log_scale = log, smooth_idf = FALSE){
loadNamespace("Matrix")
loadNamespace("text2vec")
if(length(document)!=length(term)){
stop("length of document and terms have to be the same")
}
doc_fact <- as.factor(document)
term_fact <- as.factor(term)
sparseMat <- Matrix::sparseMatrix(i = as.numeric(doc_fact), j = as.numeric(term_fact))
idf <- text2vec::get_idf(sparseMat, log_scale=log_scale, smooth_idf=smooth_idf)
idf <- idf@x[term_fact]
df <- Matrix::colSums(sparseMat)[term_fact]
data.frame(.df=df, .idf=idf)
}
#' Caluculate tf for non-standard evaluation
calc_tf <- function(tbl, document, term, ...){
document_col <- col_name(substitute(document))
term_col <- col_name(substitute(term))
calc_tf_(tbl, document_col, term_col, ...)
}
#' Caluculate tf for standard evaluation
#' private
calc_tf_ <- function(tbl, document_col, term_col, weight="ratio", k=0.5){
loadNamespace("lazyeval")
loadNamespace("dplyr")
loadNamespace("tidyr")
calc_weight <- function(df){
raw <- df$.tf
if(weight=="ratio"){
val <- raw/sum(raw)
} else if(weight=="raw_frequency"){
val <- raw
} else if (weight=="binary"){
val <- as.logical(raw)
} else if (weight=="log_normalization"){
val <- 1+log(raw)
} else if (weight=="k_normalization"){
val <- k + (1-k)*raw/max(raw)
}
else{
stop(paste0(weight, " is not recognized as weight argument"))
}
output <- data.frame(term=df[[term_col]], .tf = val)
colnames(output) <- c(term_col, ".tf")
output
}
count <- (
tbl[,colnames(tbl) == document_col | colnames(tbl)==term_col] %>%
dplyr::group_by_(document_col, term_col) %>%
dplyr::summarise(.tf = n()) %>%
dplyr::do(.tf = calc_weight(.)) %>%
tidyr::unnest(.tf)
)
dplyr::ungroup(count)
}
#' Caluculate tfidf
calc_tfidf <- function(tbl, document, term, idf_log_scale = log, smooth_idf = FALSE, tf_weight="ratio", tf_k=0.5){
loadNamespace("tidytext")
loadNamespace("dplyr")
document_col <- col_name(substitute(document))
term_col <- col_name(substitute(term))
count_tbl <- calc_tf_(tbl, document_col, term_col, weight=tf_weight, k=tf_k)
mat <- tidytext::cast_sparse_(count_tbl, document_col, term_col, ".tf")
tfidf <- calc_idf(count_tbl[[document_col]], count_tbl[[term_col]], log_scale = idf_log_scale, smooth_idf = smooth_idf)
count_tbl$.df <- tfidf$.df
count_tbl$.tfidf <- tfidf$.idf * count_tbl$.tf
count_tbl
}
#' Calc pair similarity
calc_sim <- function(tbl, group, dimention, value, upper=FALSE, diag=FALSE, method="correlation"){
group_col <- col_name(substitute(group))
dimention_col <- col_name(substitute(dimention))
value_col <- col_name(substitute(value))
if(method=="cosine"){
loadNamespace("qlcMatrix")
loadNamespace("tidytext")
mat <- tbl %>% tidytext::cast_sparse_(dimention_col, group_col, value_col)
sim <- qlcMatrix::cosSparse(mat)
} else {
loadNamespace("proxy")
mat <- tbl %>% simple_cast(group_col, dimention_col, value_col)
sim <- proxy::simil(mat, method=method, diag=FALSE)
}
if(upper){
df <- upper_gather(sim, rownames(mat), diag=diag)
}else{
loadNamespace("dplyr")
df <- sim %>% as.matrix() %>% reshape2::melt()
if(!diag){
df <- dplyr::filter(df, Var1!=Var2)
}
}
colnames(df) <- c("group1", "group2", "sim")
df[,1] <- as.character(df[,1])
df[,2] <- as.character(df[,2])
df
}
#' Calc pair distance
calc_dist <- function(tbl, group, dimention, value, upper=FALSE, diag=FALSE, method="Euclidean"){
loadNamespace("proxy")
group_col <- col_name(substitute(group))
dimention_col <- col_name(substitute(dimention))
value_col <- col_name(substitute(value))
mat <- tbl %>% simple_cast(group_col, dimention_col, value_col)
dist <- proxy::dist(mat, method=method, diag=FALSE)
if(upper){
df <- upper_gather(dist, rownames(mat), diag=diag)
}else{
loadNamespace("dplyr")
df <- dist %>% as.matrix() %>% reshape2::melt()
if(!diag){
df <- dplyr::filter(df, Var1!=Var2)
}
}
colnames(df) <- c("group1", "group2", "sim")
df[,1] <- as.character(df[,1])
df[,2] <- as.character(df[,2])
df
}
#' Column name parser
#' private
col_name <- function(x, default = stop("Please supply column name", call. = FALSE))
{
if (is.character(x))
return(x)
if (identical(x, quote(expr = )))
return(default)
if (is.name(x))
return(as.character(x))
if (is.null(x))
return(x)
stop("Invalid column specification", call. = FALSE)
}
#' Simple cast wrapper
#' private
simple_cast <- function(data, row, col, val, fun.aggregate=mean, fill=0){
loadNamespace("reshape2")
fml <- as.formula(paste(row, col, sep = "~"))
data %>% reshape2::acast(fml, value.var=val, fun.aggregate=fun.aggregate, fill=fill)
}
#' Gather only upper half
#' private
upper_gather <- function(mat, names=NULL, diag=NULL){
if(is.atomic(mat) & is.vector(mat)){
# This is basically for proxy::dist, simil functions
# It provides numeric vector of upper half
dim_size <- sqrt(2*length(mat)+1/4)+1/2
if(is.null(names)){
# detect dimension
names <- seq(dim_size)
} else {
if(length(names) != dim_size){
stop("number of names doesn't match matrix dimension")
}
}
loadNamespace("Matrix")
# create a triangler matrix to melt
trimat <- matrix(nrow=length(names), ncol=length(names))
trimat[row(trimat)<col(trimat)] <- as.numeric(mat)
colnames(trimat) <- names
rownames(trimat) <- names
if(!is.null(diag)){
trimat[row(trimat)==col(trimat)] = rep(diag, length(names))
}
reshape2::melt(trimat, na.rm=TRUE)
}else{
# diag can be NULL or FALSE
upper_tri <- upper.tri(mat, diag=!is.null(diag) && diag)
cnames <- colnames(mat)
rnames <- rownames(mat)
if(is.null(cnames)){
cnames <- seq(ncol(mat))
}
if(is.null(rnames)){
rnames <- seq(nrow(mat))
}
ind <- which( upper_tri , arr.ind = TRUE )
val <- mat[upper_tri]
data.frame(
Var1=rnames[ind[,1]],
Var2=cnames[ind[,2]],
value=val)
}
}
# Compress dimension
compress_dimension <- function(tbl, group, dimension, value, type="group", fill=0, fun.aggregate=mean, dim=NULL){
loadNamespace("reshape2")
group_col <- col_name(substitute(group))
dimension_col <- col_name(substitute(dimension))
value_col <- col_name(substitute(value))
fml <- as.formula(paste(group_col, dimension_col, sep="~"))
matrix <- reshape2::acast(tbl, fml, value.var=value_col, fill=fill, fun.aggregate=fun.aggregate)
dim <- min(dim, ncol(matrix))
if(type=="group"){
result <- svd(sweep(matrix, 2, colMeans(matrix), "-"), nu=dim, nv=0)
stdev <- result$d[seq(max(dim, length(result$d)))]
stdev[is.na(stdev)] <- 0
mat <- result$u %*% diag(result$d[seq(dim)])
rownames(mat) <- rownames(matrix)
result <- reshape2::melt(mat)
colnames(result) <- c("group", "component", "value")
} else if (type=="dimension") {
result <- svd(sweep(matrix, 2, colMeans(matrix), "-"), nv=dim, nu=0)
mat <- result$v
stdev <- result$d[seq(max(dim, length(result$d)))]
stdev[is.na(stdev)] <- 0
rownames(mat) <- colnames(matrix)
result <- reshape2::melt(mat)
colnames(result) <- c("dimension", "component", "value")
}
rep_sdev <- rep(stdev, each=nrow(result)/length(stdev))
result$stdev <- rep_sdev
result
}
wordstem <- function(...){
loadNamespace("quanteda")
quanteda::wordstem(...)
}
generate_ngrams <- function(tbl, group, token, n, skip){
loadNamespace("dplyr")
loadNamespace("tidyr")
loadNamespace("quanteda")
group_col <- col_name(substitute(group))
token_col <- col_name(substitute(token))
grouped <- (
tbl
%>% dplyr::group_by_(group_col))
indices <- attr(grouped, "indices")
labels <- attr(grouped, "labels")
labels[[token_col]] <- lapply(indices, function(index){
browser()
quanteda::skipgrams(as.character(tbl[[token_col]][index+1]), n=n, skip=skip)
})
unnested <- tidyr::unnest_(labels, token_col)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment