Last active
June 3, 2016 20:08
-
-
Save yosuke-yasuda/9aeab489ea4f204e612aeaf345ab24cf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Get idf for terms | |
calc_idf <- function(document, term, log_scale = log, smooth_idf = FALSE){ | |
loadNamespace("Matrix") | |
loadNamespace("text2vec") | |
if(length(document)!=length(term)){ | |
stop("length of document and terms have to be the same") | |
} | |
doc_fact <- as.factor(document) | |
term_fact <- as.factor(term) | |
sparseMat <- Matrix::sparseMatrix(i = as.numeric(doc_fact), j = as.numeric(term_fact)) | |
idf <- text2vec::get_idf(sparseMat, log_scale=log_scale, smooth_idf=smooth_idf) | |
idf <- idf@x[term_fact] | |
df <- Matrix::colSums(sparseMat)[term_fact] | |
data.frame(.df=df, .idf=idf) | |
} | |
#' Caluculate tf for non-standard evaluation | |
calc_tf <- function(tbl, document, term, ...){ | |
document_col <- col_name(substitute(document)) | |
term_col <- col_name(substitute(term)) | |
calc_tf_(tbl, document_col, term_col, ...) | |
} | |
#' Caluculate tf for standard evaluation | |
#' private | |
calc_tf_ <- function(tbl, document_col, term_col, weight="ratio", k=0.5){ | |
loadNamespace("lazyeval") | |
loadNamespace("dplyr") | |
loadNamespace("tidyr") | |
calc_weight <- function(df){ | |
raw <- df$.tf | |
if(weight=="ratio"){ | |
val <- raw/sum(raw) | |
} else if(weight=="raw_frequency"){ | |
val <- raw | |
} else if (weight=="binary"){ | |
val <- as.logical(raw) | |
} else if (weight=="log_normalization"){ | |
val <- 1+log(raw) | |
} else if (weight=="k_normalization"){ | |
val <- k + (1-k)*raw/max(raw) | |
} | |
else{ | |
stop(paste0(weight, " is not recognized as weight argument")) | |
} | |
output <- data.frame(term=df[[term_col]], .tf = val) | |
colnames(output) <- c(term_col, ".tf") | |
output | |
} | |
count <- ( | |
tbl[,colnames(tbl) == document_col | colnames(tbl)==term_col] %>% | |
dplyr::group_by_(document_col, term_col) %>% | |
dplyr::summarise(.tf = n()) %>% | |
dplyr::do(.tf = calc_weight(.)) %>% | |
tidyr::unnest(.tf) | |
) | |
dplyr::ungroup(count) | |
} | |
#' Caluculate tfidf | |
calc_tfidf <- function(tbl, document, term, idf_log_scale = log, smooth_idf = FALSE, tf_weight="ratio", tf_k=0.5){ | |
loadNamespace("tidytext") | |
loadNamespace("dplyr") | |
document_col <- col_name(substitute(document)) | |
term_col <- col_name(substitute(term)) | |
count_tbl <- calc_tf_(tbl, document_col, term_col, weight=tf_weight, k=tf_k) | |
mat <- tidytext::cast_sparse_(count_tbl, document_col, term_col, ".tf") | |
tfidf <- calc_idf(count_tbl[[document_col]], count_tbl[[term_col]], log_scale = idf_log_scale, smooth_idf = smooth_idf) | |
count_tbl$.df <- tfidf$.df | |
count_tbl$.tfidf <- tfidf$.idf * count_tbl$.tf | |
count_tbl | |
} | |
#' Calc pair similarity | |
calc_sim <- function(tbl, group, dimention, value, upper=FALSE, diag=FALSE, method="correlation"){ | |
group_col <- col_name(substitute(group)) | |
dimention_col <- col_name(substitute(dimention)) | |
value_col <- col_name(substitute(value)) | |
if(method=="cosine"){ | |
loadNamespace("qlcMatrix") | |
loadNamespace("tidytext") | |
mat <- tbl %>% tidytext::cast_sparse_(dimention_col, group_col, value_col) | |
sim <- qlcMatrix::cosSparse(mat) | |
} else { | |
loadNamespace("proxy") | |
mat <- tbl %>% simple_cast(group_col, dimention_col, value_col) | |
sim <- proxy::simil(mat, method=method, diag=FALSE) | |
} | |
if(upper){ | |
df <- upper_gather(sim, rownames(mat), diag=diag) | |
}else{ | |
loadNamespace("dplyr") | |
df <- sim %>% as.matrix() %>% reshape2::melt() | |
if(!diag){ | |
df <- dplyr::filter(df, Var1!=Var2) | |
} | |
} | |
colnames(df) <- c("group1", "group2", "sim") | |
df[,1] <- as.character(df[,1]) | |
df[,2] <- as.character(df[,2]) | |
df | |
} | |
#' Calc pair distance | |
calc_dist <- function(tbl, group, dimention, value, upper=FALSE, diag=FALSE, method="Euclidean"){ | |
loadNamespace("proxy") | |
group_col <- col_name(substitute(group)) | |
dimention_col <- col_name(substitute(dimention)) | |
value_col <- col_name(substitute(value)) | |
mat <- tbl %>% simple_cast(group_col, dimention_col, value_col) | |
dist <- proxy::dist(mat, method=method, diag=FALSE) | |
if(upper){ | |
df <- upper_gather(dist, rownames(mat), diag=diag) | |
}else{ | |
loadNamespace("dplyr") | |
df <- dist %>% as.matrix() %>% reshape2::melt() | |
if(!diag){ | |
df <- dplyr::filter(df, Var1!=Var2) | |
} | |
} | |
colnames(df) <- c("group1", "group2", "sim") | |
df[,1] <- as.character(df[,1]) | |
df[,2] <- as.character(df[,2]) | |
df | |
} | |
#' Column name parser | |
#' private | |
col_name <- function(x, default = stop("Please supply column name", call. = FALSE)) | |
{ | |
if (is.character(x)) | |
return(x) | |
if (identical(x, quote(expr = ))) | |
return(default) | |
if (is.name(x)) | |
return(as.character(x)) | |
if (is.null(x)) | |
return(x) | |
stop("Invalid column specification", call. = FALSE) | |
} | |
#' Simple cast wrapper | |
#' private | |
simple_cast <- function(data, row, col, val, fun.aggregate=mean, fill=0){ | |
loadNamespace("reshape2") | |
fml <- as.formula(paste(row, col, sep = "~")) | |
data %>% reshape2::acast(fml, value.var=val, fun.aggregate=fun.aggregate, fill=fill) | |
} | |
#' Gather only upper half | |
#' private | |
upper_gather <- function(mat, names=NULL, diag=NULL){ | |
if(is.atomic(mat) & is.vector(mat)){ | |
# This is basically for proxy::dist, simil functions | |
# It provides numeric vector of upper half | |
dim_size <- sqrt(2*length(mat)+1/4)+1/2 | |
if(is.null(names)){ | |
# detect dimension | |
names <- seq(dim_size) | |
} else { | |
if(length(names) != dim_size){ | |
stop("number of names doesn't match matrix dimension") | |
} | |
} | |
loadNamespace("Matrix") | |
# create a triangler matrix to melt | |
trimat <- matrix(nrow=length(names), ncol=length(names)) | |
trimat[row(trimat)<col(trimat)] <- as.numeric(mat) | |
colnames(trimat) <- names | |
rownames(trimat) <- names | |
if(!is.null(diag)){ | |
trimat[row(trimat)==col(trimat)] = rep(diag, length(names)) | |
} | |
reshape2::melt(trimat, na.rm=TRUE) | |
}else{ | |
# diag can be NULL or FALSE | |
upper_tri <- upper.tri(mat, diag=!is.null(diag) && diag) | |
cnames <- colnames(mat) | |
rnames <- rownames(mat) | |
if(is.null(cnames)){ | |
cnames <- seq(ncol(mat)) | |
} | |
if(is.null(rnames)){ | |
rnames <- seq(nrow(mat)) | |
} | |
ind <- which( upper_tri , arr.ind = TRUE ) | |
val <- mat[upper_tri] | |
data.frame( | |
Var1=rnames[ind[,1]], | |
Var2=cnames[ind[,2]], | |
value=val) | |
} | |
} | |
# Compress dimension | |
compress_dimension <- function(tbl, group, dimension, value, type="group", fill=0, fun.aggregate=mean, dim=NULL){ | |
loadNamespace("reshape2") | |
group_col <- col_name(substitute(group)) | |
dimension_col <- col_name(substitute(dimension)) | |
value_col <- col_name(substitute(value)) | |
fml <- as.formula(paste(group_col, dimension_col, sep="~")) | |
matrix <- reshape2::acast(tbl, fml, value.var=value_col, fill=fill, fun.aggregate=fun.aggregate) | |
dim <- min(dim, ncol(matrix)) | |
if(type=="group"){ | |
result <- svd(sweep(matrix, 2, colMeans(matrix), "-"), nu=dim, nv=0) | |
stdev <- result$d[seq(max(dim, length(result$d)))] | |
stdev[is.na(stdev)] <- 0 | |
mat <- result$u %*% diag(result$d[seq(dim)]) | |
rownames(mat) <- rownames(matrix) | |
result <- reshape2::melt(mat) | |
colnames(result) <- c("group", "component", "value") | |
} else if (type=="dimension") { | |
result <- svd(sweep(matrix, 2, colMeans(matrix), "-"), nv=dim, nu=0) | |
mat <- result$v | |
stdev <- result$d[seq(max(dim, length(result$d)))] | |
stdev[is.na(stdev)] <- 0 | |
rownames(mat) <- colnames(matrix) | |
result <- reshape2::melt(mat) | |
colnames(result) <- c("dimension", "component", "value") | |
} | |
rep_sdev <- rep(stdev, each=nrow(result)/length(stdev)) | |
result$stdev <- rep_sdev | |
result | |
} | |
wordstem <- function(...){ | |
loadNamespace("quanteda") | |
quanteda::wordstem(...) | |
} | |
generate_ngrams <- function(tbl, group, token, n, skip){ | |
loadNamespace("dplyr") | |
loadNamespace("tidyr") | |
loadNamespace("quanteda") | |
group_col <- col_name(substitute(group)) | |
token_col <- col_name(substitute(token)) | |
grouped <- ( | |
tbl | |
%>% dplyr::group_by_(group_col)) | |
indices <- attr(grouped, "indices") | |
labels <- attr(grouped, "labels") | |
labels[[token_col]] <- lapply(indices, function(index){ | |
browser() | |
quanteda::skipgrams(as.character(tbl[[token_col]][index+1]), n=n, skip=skip) | |
}) | |
unnested <- tidyr::unnest_(labels, token_col) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment