yosuke-yasuda · June 3, 2016 20:08
diff --git a/text_functions b/text_functions
 #' Get idf for terms
 calc_idf <- function(document, term, log_scale = log, smooth_idf = FALSE){
  loadNamespace("Matrix")
  loadNamespace("text2vec")
  if(length(document)!=length(term)){
    stop("length of document and terms have to be the same")
  }
  doc_fact <- as.factor(document)
  term_fact <- as.factor(term)
  sparseMat <- Matrix::sparseMatrix(i = as.numeric(doc_fact), j = as.numeric(term_fact))
  idf <- text2vec::get_idf(sparseMat, log_scale=log_scale, smooth_idf=smooth_idf)
  idf <- idf@x[term_fact]
  df <- Matrix::colSums(sparseMat)[term_fact]
  data.frame(.df=df, .idf=idf)
 }

 #' Caluculate tf for non-standard evaluation
 calc_tf <- function(tbl, document, term, ...){
  document_col <- col_name(substitute(document))
  term_col <- col_name(substitute(term))
  calc_tf_(tbl, document_col, term_col, ...)
 }

 #' Caluculate tf for standard evaluation
 #' private
 calc_tf_ <- function(tbl, document_col, term_col, weight="ratio", k=0.5){
  loadNamespace("lazyeval")
  loadNamespace("dplyr")
  loadNamespace("tidyr")
  calc_weight <- function(df){
    raw <- df$.tf
    if(weight=="ratio"){
      val <- raw/sum(raw)
    } else if(weight=="raw_frequency"){
      val <- raw
    } else if (weight=="binary"){
      val <- as.logical(raw)
    } else if (weight=="log_normalization"){
      val <- 1+log(raw)
    } else if (weight=="k_normalization"){
      val <- k + (1-k)*raw/max(raw)
    }
    else{
      stop(paste0(weight, " is not recognized as weight argument"))
    }

    output <- data.frame(term=df[[term_col]], .tf = val)
    colnames(output) <- c(term_col, ".tf")
    output
  }
  count <- (
    tbl[,colnames(tbl) == document_col | colnames(tbl)==term_col] %>%
    dplyr::group_by_(document_col, term_col) %>%
    dplyr::summarise(.tf = n()) %>%
    dplyr::do(.tf = calc_weight(.)) %>%
    tidyr::unnest(.tf)
    )

  dplyr::ungroup(count)
 }

 #' Caluculate tfidf
 calc_tfidf <- function(tbl, document, term, idf_log_scale = log, smooth_idf = FALSE, tf_weight="ratio", tf_k=0.5){
  loadNamespace("tidytext")
  loadNamespace("dplyr")
  document_col <- col_name(substitute(document))
  term_col <- col_name(substitute(term))
  count_tbl <- calc_tf_(tbl, document_col, term_col, weight=tf_weight, k=tf_k)
  mat <- tidytext::cast_sparse_(count_tbl, document_col, term_col, ".tf")
  tfidf <- calc_idf(count_tbl[[document_col]], count_tbl[[term_col]], log_scale = idf_log_scale, smooth_idf = smooth_idf)
  count_tbl$.df <- tfidf$.df
  count_tbl$.tfidf <- tfidf$.idf * count_tbl$.tf
  count_tbl
 }

 #' Calc pair similarity
 calc_sim <- function(tbl, group, dimention, value, upper=FALSE, diag=FALSE, method="correlation"){
  group_col <- col_name(substitute(group))
  dimention_col <- col_name(substitute(dimention))
  value_col <- col_name(substitute(value))
  if(method=="cosine"){
    loadNamespace("qlcMatrix")
    loadNamespace("tidytext")
    mat <- tbl %>%  tidytext::cast_sparse_(dimention_col, group_col, value_col)
    sim <- qlcMatrix::cosSparse(mat)
  } else {
    loadNamespace("proxy")
    mat <- tbl %>%  simple_cast(group_col, dimention_col, value_col)
    sim <- proxy::simil(mat, method=method, diag=FALSE)
  }
  if(upper){
    df <- upper_gather(sim, rownames(mat), diag=diag)
  }else{
    loadNamespace("dplyr")
    df <- sim %>%  as.matrix() %>%  reshape2::melt()
    if(!diag){
      df <- dplyr::filter(df, Var1!=Var2)
    }
  }
  colnames(df) <- c("group1", "group2", "sim")
  df[,1] <- as.character(df[,1])
  df[,2] <- as.character(df[,2])
  df
 }

 #' Calc pair distance
 calc_dist <- function(tbl, group, dimention, value, upper=FALSE, diag=FALSE, method="Euclidean"){
  loadNamespace("proxy")
  group_col <- col_name(substitute(group))
  dimention_col <- col_name(substitute(dimention))
  value_col <- col_name(substitute(value))
  mat <- tbl %>%  simple_cast(group_col, dimention_col, value_col)
  dist <- proxy::dist(mat, method=method, diag=FALSE)
  if(upper){
    df <- upper_gather(dist, rownames(mat), diag=diag)
  }else{
    loadNamespace("dplyr")
    df <- dist %>%  as.matrix() %>%  reshape2::melt()
    if(!diag){
      df <- dplyr::filter(df, Var1!=Var2)
    }
  }
  colnames(df) <- c("group1", "group2", "sim")
  df[,1] <- as.character(df[,1])
  df[,2] <- as.character(df[,2])
  df
 }

 #' Column name parser
 #' private
 col_name <- function(x, default = stop("Please supply column name", call. = FALSE))
 {
  if (is.character(x))
    return(x)
  if (identical(x, quote(expr = )))
    return(default)
  if (is.name(x))
    return(as.character(x))
  if (is.null(x))
    return(x)
  stop("Invalid column specification", call. = FALSE)
 }

 #' Simple cast wrapper
 #' private
 simple_cast <- function(data, row, col, val, fun.aggregate=mean, fill=0){
  loadNamespace("reshape2")
  fml <- as.formula(paste(row, col, sep = "~"))
  data %>%  reshape2::acast(fml, value.var=val, fun.aggregate=fun.aggregate, fill=fill)
 }

 #' Gather only upper half
 #' private
 upper_gather <- function(mat, names=NULL, diag=NULL){
  if(is.atomic(mat) & is.vector(mat)){
    # This is basically for proxy::dist, simil functions
    # It provides numeric vector of upper half
    dim_size <- sqrt(2*length(mat)+1/4)+1/2
    if(is.null(names)){
      # detect dimension
      names <- seq(dim_size)
    } else {
      if(length(names) != dim_size){
        stop("number of names doesn't match matrix dimension")
      }
    }
    loadNamespace("Matrix")
    # create a triangler matrix to melt
    trimat <- matrix(nrow=length(names), ncol=length(names))
    trimat[row(trimat)<col(trimat)] <- as.numeric(mat)
    colnames(trimat) <- names
    rownames(trimat) <- names
    if(!is.null(diag)){
      trimat[row(trimat)==col(trimat)] = rep(diag, length(names))
    }
    reshape2::melt(trimat, na.rm=TRUE)
  }else{
    # diag can be NULL or FALSE
    upper_tri <- upper.tri(mat, diag=!is.null(diag) && diag)
    cnames <- colnames(mat)
    rnames <- rownames(mat)
    if(is.null(cnames)){
      cnames <- seq(ncol(mat))
    }
    if(is.null(rnames)){
      rnames <- seq(nrow(mat))
    }
    ind <- which( upper_tri , arr.ind = TRUE )
    val <- mat[upper_tri]
    data.frame(
      Var1=rnames[ind[,1]],
      Var2=cnames[ind[,2]],
      value=val)
  }
 }

 # Compress dimension
 compress_dimension <- function(tbl, group, dimension, value, type="group", fill=0, fun.aggregate=mean, dim=NULL){
  loadNamespace("reshape2")
  group_col <- col_name(substitute(group))
  dimension_col <- col_name(substitute(dimension))
  value_col <- col_name(substitute(value))
  fml <- as.formula(paste(group_col, dimension_col, sep="~"))
  matrix <- reshape2::acast(tbl, fml, value.var=value_col, fill=fill, fun.aggregate=fun.aggregate)
  dim <- min(dim, ncol(matrix))
  if(type=="group"){
    result <- svd(sweep(matrix, 2, colMeans(matrix), "-"), nu=dim, nv=0)
    stdev <- result$d[seq(max(dim, length(result$d)))]
    stdev[is.na(stdev)] <- 0
    mat <- result$u %*% diag(result$d[seq(dim)])
    rownames(mat) <- rownames(matrix)
    result <- reshape2::melt(mat)
    colnames(result) <- c("group", "component", "value")
  } else if (type=="dimension") {
    result <- svd(sweep(matrix, 2, colMeans(matrix), "-"), nv=dim, nu=0)
    mat <- result$v
    stdev <- result$d[seq(max(dim, length(result$d)))]
    stdev[is.na(stdev)] <- 0
    rownames(mat) <- colnames(matrix)
    result <- reshape2::melt(mat)
    colnames(result) <- c("dimension", "component", "value")
  }
  rep_sdev <- rep(stdev, each=nrow(result)/length(stdev))
  result$stdev <- rep_sdev
  result
 }

 wordstem <- function(...){
  loadNamespace("quanteda")
  quanteda::wordstem(...)
 }

 generate_ngrams <- function(tbl, group, token, n, skip){
  loadNamespace("dplyr")
  loadNamespace("tidyr")
  loadNamespace("quanteda")
  group_col <- col_name(substitute(group))
  token_col <- col_name(substitute(token))
  grouped <- (
    tbl
    %>%  dplyr::group_by_(group_col))

  indices <- attr(grouped, "indices")
  labels <- attr(grouped, "labels")
  labels[[token_col]] <- lapply(indices, function(index){
    browser()
    quanteda::skipgrams(as.character(tbl[[token_col]][index+1]), n=n, skip=skip)
  })
  unnested <- tidyr::unnest_(labels, token_col)
 }
	#' Get idf for terms
	calc_idf <- function(document, term, log_scale = log, smooth_idf = FALSE){
	loadNamespace("Matrix")
	loadNamespace("text2vec")
	if(length(document)!=length(term)){
	stop("length of document and terms have to be the same")
	}
	doc_fact <- as.factor(document)
	term_fact <- as.factor(term)
	sparseMat <- Matrix::sparseMatrix(i = as.numeric(doc_fact), j = as.numeric(term_fact))
	idf <- text2vec::get_idf(sparseMat, log_scale=log_scale, smooth_idf=smooth_idf)
	idf <- idf@x[term_fact]
	df <- Matrix::colSums(sparseMat)[term_fact]
	data.frame(.df=df, .idf=idf)
	}

	#' Caluculate tf for non-standard evaluation
	calc_tf <- function(tbl, document, term, ...){
	document_col <- col_name(substitute(document))
	term_col <- col_name(substitute(term))
	calc_tf_(tbl, document_col, term_col, ...)
	}

	#' Caluculate tf for standard evaluation
	#' private
	calc_tf_ <- function(tbl, document_col, term_col, weight="ratio", k=0.5){
	loadNamespace("lazyeval")
	loadNamespace("dplyr")
	loadNamespace("tidyr")
	calc_weight <- function(df){
	raw <- df$.tf
	if(weight=="ratio"){
	val <- raw/sum(raw)
	} else if(weight=="raw_frequency"){
	val <- raw
	} else if (weight=="binary"){
	val <- as.logical(raw)
	} else if (weight=="log_normalization"){
	val <- 1+log(raw)
	} else if (weight=="k_normalization"){
	val <- k + (1-k)*raw/max(raw)
	}
	else{
	stop(paste0(weight, " is not recognized as weight argument"))
	}

	output <- data.frame(term=df[[term_col]], .tf = val)
	colnames(output) <- c(term_col, ".tf")
	output
	}
	count <- (
	tbl[,colnames(tbl) == document_col \| colnames(tbl)==term_col] %>%
	dplyr::group_by_(document_col, term_col) %>%
	dplyr::summarise(.tf = n()) %>%
	dplyr::do(.tf = calc_weight(.)) %>%
	tidyr::unnest(.tf)
	)

	dplyr::ungroup(count)
	}

	#' Caluculate tfidf
	calc_tfidf <- function(tbl, document, term, idf_log_scale = log, smooth_idf = FALSE, tf_weight="ratio", tf_k=0.5){
	loadNamespace("tidytext")
	loadNamespace("dplyr")
	document_col <- col_name(substitute(document))
	term_col <- col_name(substitute(term))
	count_tbl <- calc_tf_(tbl, document_col, term_col, weight=tf_weight, k=tf_k)
	mat <- tidytext::cast_sparse_(count_tbl, document_col, term_col, ".tf")
	tfidf <- calc_idf(count_tbl[[document_col]], count_tbl[[term_col]], log_scale = idf_log_scale, smooth_idf = smooth_idf)
	count_tbl$.df <- tfidf$.df
	count_tbl$.tfidf <- tfidf$.idf * count_tbl$.tf
	count_tbl
	}

	#' Calc pair similarity
	calc_sim <- function(tbl, group, dimention, value, upper=FALSE, diag=FALSE, method="correlation"){
	group_col <- col_name(substitute(group))
	dimention_col <- col_name(substitute(dimention))
	value_col <- col_name(substitute(value))
	if(method=="cosine"){
	loadNamespace("qlcMatrix")
	loadNamespace("tidytext")
	mat <- tbl %>% tidytext::cast_sparse_(dimention_col, group_col, value_col)
	sim <- qlcMatrix::cosSparse(mat)
	} else {
	loadNamespace("proxy")
	mat <- tbl %>% simple_cast(group_col, dimention_col, value_col)
	sim <- proxy::simil(mat, method=method, diag=FALSE)
	}
	if(upper){
	df <- upper_gather(sim, rownames(mat), diag=diag)
	}else{
	loadNamespace("dplyr")
	df <- sim %>% as.matrix() %>% reshape2::melt()
	if(!diag){
	df <- dplyr::filter(df, Var1!=Var2)
	}
	}
	colnames(df) <- c("group1", "group2", "sim")
	df[,1] <- as.character(df[,1])
	df[,2] <- as.character(df[,2])
	df
	}

	#' Calc pair distance
	calc_dist <- function(tbl, group, dimention, value, upper=FALSE, diag=FALSE, method="Euclidean"){
	loadNamespace("proxy")
	group_col <- col_name(substitute(group))
	dimention_col <- col_name(substitute(dimention))
	value_col <- col_name(substitute(value))
	mat <- tbl %>% simple_cast(group_col, dimention_col, value_col)
	dist <- proxy::dist(mat, method=method, diag=FALSE)
	if(upper){
	df <- upper_gather(dist, rownames(mat), diag=diag)
	}else{
	loadNamespace("dplyr")
	df <- dist %>% as.matrix() %>% reshape2::melt()
	if(!diag){
	df <- dplyr::filter(df, Var1!=Var2)
	}
	}
	colnames(df) <- c("group1", "group2", "sim")
	df[,1] <- as.character(df[,1])
	df[,2] <- as.character(df[,2])
	df
	}

	#' Column name parser
	#' private
	col_name <- function(x, default = stop("Please supply column name", call. = FALSE))
	{
	if (is.character(x))
	return(x)
	if (identical(x, quote(expr = )))
	return(default)
	if (is.name(x))
	return(as.character(x))
	if (is.null(x))
	return(x)
	stop("Invalid column specification", call. = FALSE)
	}

	#' Simple cast wrapper
	#' private
	simple_cast <- function(data, row, col, val, fun.aggregate=mean, fill=0){
	loadNamespace("reshape2")
	fml <- as.formula(paste(row, col, sep = "~"))
	data %>% reshape2::acast(fml, value.var=val, fun.aggregate=fun.aggregate, fill=fill)
	}

	#' Gather only upper half
	#' private
	upper_gather <- function(mat, names=NULL, diag=NULL){
	if(is.atomic(mat) & is.vector(mat)){
	# This is basically for proxy::dist, simil functions
	# It provides numeric vector of upper half
	dim_size <- sqrt(2*length(mat)+1/4)+1/2
	if(is.null(names)){
	# detect dimension
	names <- seq(dim_size)
	} else {
	if(length(names) != dim_size){
	stop("number of names doesn't match matrix dimension")
	}
	}
	loadNamespace("Matrix")
	# create a triangler matrix to melt
	trimat <- matrix(nrow=length(names), ncol=length(names))
	trimat[row(trimat)<col(trimat)] <- as.numeric(mat)
	colnames(trimat) <- names
	rownames(trimat) <- names
	if(!is.null(diag)){
	trimat[row(trimat)==col(trimat)] = rep(diag, length(names))
	}
	reshape2::melt(trimat, na.rm=TRUE)
	}else{
	# diag can be NULL or FALSE
	upper_tri <- upper.tri(mat, diag=!is.null(diag) && diag)
	cnames <- colnames(mat)
	rnames <- rownames(mat)
	if(is.null(cnames)){
	cnames <- seq(ncol(mat))
	}
	if(is.null(rnames)){
	rnames <- seq(nrow(mat))
	}
	ind <- which( upper_tri , arr.ind = TRUE )
	val <- mat[upper_tri]
	data.frame(
	Var1=rnames[ind[,1]],
	Var2=cnames[ind[,2]],
	value=val)
	}
	}

	# Compress dimension
	compress_dimension <- function(tbl, group, dimension, value, type="group", fill=0, fun.aggregate=mean, dim=NULL){
	loadNamespace("reshape2")
	group_col <- col_name(substitute(group))
	dimension_col <- col_name(substitute(dimension))
	value_col <- col_name(substitute(value))
	fml <- as.formula(paste(group_col, dimension_col, sep="~"))
	matrix <- reshape2::acast(tbl, fml, value.var=value_col, fill=fill, fun.aggregate=fun.aggregate)
	dim <- min(dim, ncol(matrix))
	if(type=="group"){
	result <- svd(sweep(matrix, 2, colMeans(matrix), "-"), nu=dim, nv=0)
	stdev <- result$d[seq(max(dim, length(result$d)))]
	stdev[is.na(stdev)] <- 0
	mat <- result$u %*% diag(result$d[seq(dim)])
	rownames(mat) <- rownames(matrix)
	result <- reshape2::melt(mat)
	colnames(result) <- c("group", "component", "value")
	} else if (type=="dimension") {
	result <- svd(sweep(matrix, 2, colMeans(matrix), "-"), nv=dim, nu=0)
	mat <- result$v
	stdev <- result$d[seq(max(dim, length(result$d)))]
	stdev[is.na(stdev)] <- 0
	rownames(mat) <- colnames(matrix)
	result <- reshape2::melt(mat)
	colnames(result) <- c("dimension", "component", "value")
	}
	rep_sdev <- rep(stdev, each=nrow(result)/length(stdev))
	result$stdev <- rep_sdev
	result
	}

	wordstem <- function(...){
	loadNamespace("quanteda")
	quanteda::wordstem(...)
	}

	generate_ngrams <- function(tbl, group, token, n, skip){
	loadNamespace("dplyr")
	loadNamespace("tidyr")
	loadNamespace("quanteda")
	group_col <- col_name(substitute(group))
	token_col <- col_name(substitute(token))
	grouped <- (
	tbl
	%>% dplyr::group_by_(group_col))

	indices <- attr(grouped, "indices")
	labels <- attr(grouped, "labels")
	labels[[token_col]] <- lapply(indices, function(index){
	browser()
	quanteda::skipgrams(as.character(tbl[[token_col]][index+1]), n=n, skip=skip)
	})
	unnested <- tidyr::unnest_(labels, token_col)
	}