trinker · January 10, 2019 13:54
diff --git a/fuzzy_map.R b/fuzzy_map.R
 fuzzy_map <- function(x, y, distance = 'cosine', cutoff = .40,
                      remove = c('the', 'a', 'an', 'at', 'of', 'in', '-', ' - CL', ' - OS'),
                      substitution = data.frame(
                        pattern = c('univ\\.', '\\bst\\.', '\\bmt\\.', '&'),
                        replacement = c('university', 'saint', 'mount', ' and '),
                        stringsAsFactors = FALSE
                      ),
                      ngrams = 1, # choose 1 or 2 (2 is 2 words)
                      downweight.words = c('suny', 'cuny', 'college', 'university', 'state'),
                      # words that are less important - make smaller to make the words LESS important
                      downweight = .25, ...
 ){

  cutoff <- 1 - cutoff
  if (!is.null(remove)) remove <- tolower(remove)
  if (!is.null(downweight)) downweight.words <- tolower(downweight.words)
  pat <- substitution[['pattern']]
  rep <- substitution[['replacement']]


  datx <- dplyr::data_frame(
    x = x,
    xl = tolower(x)
  )
  daty <- dplyr::data_frame(
    y = y,
    yl = tolower(y)
  )

  xu <- unique(datx[['xl']])
  yu <- unique(daty[['yl']])

  ## make a document term matrix of all x values (left)
  left <- dtm_it(xu, pattern = pat, replacement = rep, ngrams = ngrams)

  ## make a one-row document term matrix for each y value (right)
  rights <- lapply(yu, function(y_val) {
    o <- dtm_it(y_val, pattern = pat, replacement = rep, ngrams = ngrams)
    rownames(o) <- 'right'
    o
  })

  ## get the distance measure function
  dm <- distance_measure(distance)

  ## loop throu the y (right) document term matrices
  ##   1. combine witht he x document term matrix
  ##   2. weight as tf-idf
  ##   3. compute simialrity
  locs <- unlist(lapply(rights, function(r){

    dtm <- c(left, r)
    dtm <- tm::weightTfIdf(dtm)

    dww <- downweight.words[downweight.words %in% colnames(dtm)]
    if (!is.null(dww) && length(dww) > 0) {
      dtm <- cbind(
        dtm[, !colnames(dtm) %in% dww],
        dtm[,dww]*downweight
      )
    }



    distmat <- dm(dtm)

    vals <- unlist(as.matrix(distmat)[nrow(dtm),  -nrow(dtm)])
    if(any(vals <= cutoff)){
      which.min(vals)
    } else {
      NA
    }

  }), use.names = FALSE)

  xu_misses <- !seq_len(length(xu)) %in% locs

  lower_key <- dplyr::data_frame(
    xl = c(xu[locs], xu[xu_misses]),
    yl = c(yu, rep(NA, sum(xu_misses, na.rm = TRUE)))
  )

  dplyr::select(dplyr::full_join(
    dplyr::left_join(datx, lower_key, by = 'xl'),
    daty, by = 'yl'
  ), x, y)
 }

 ## look at bi vs unigram
 dtm_it <- function(x2, pattern, replacement, ngrams = 1, ...){

  if (!is.null(remove)) {
    x2 <- textclean::replace_tokens(x2, remove)
  }
  if (!is.null(remove)) {
    x2 <- textclean::mgsub(x2,
                           pattern = pattern,
                           replacement = replacement,
                           fixed = FALSE
    )
  }


  quanteda::as.DocumentTermMatrix(quanteda::dfm(quanteda::tokens_tolower(
    quanteda::tokens(
      x2,
      remove_punct = FALSE,
      remove_numbers = TRUE,
      remove_hyphens = TRUE,
      ngrams = ngrams
    )
  )))
 }

 distance_measure <- function(dist){
  switch(dist,
         'cosine' = cosine_distance,
         'jaccard' = jaccard_distance,
         stop('provide a valid `distance` type')
  )
 }

 #' Optimized Computation of Cosine Distance
 #'
 #' Utilizes the \pkg{slam} package to efficiently calculate cosine distance
 #' on large sparse matrices.
 #'
 #' @param x A data type (e.g., \code{\link[tm]{DocumentTermMatrix}} or
 #' \code{\link[tm]{TermDocumentMatrix}}).
 #' @param \ldots ignored.
 #' @return Returns a cosine distance object of class \code{"dist"}.
 #' @references \url{http://stackoverflow.com/a/29755756/1000343}
 #' @keywords cosine dissimilarity
 #' @rdname cosine_distance
 #' @export
 #' @author Michael Andrec and Tyler Rinker <tyler.rinker@@gmail.com>.
 #' @examples
 #' library(gofastr)
 #' library(dplyr)
 #'
 #' out <- presidential_debates_2012 %>%
 #'     with(q_dtm(dialogue)) %>%
 #'     cosine_distance()
 cosine_distance <- function(x, ...){
  UseMethod("cosine_distance")
 }


 #' @export
 #' @rdname cosine_distance
 #' @method cosine_distance DocumentTermMatrix
 cosine_distance.DocumentTermMatrix <- function(x, ...){
  cosine_distance(slam::as.simple_triplet_matrix(x))
 }

 #' @export
 #' @rdname cosine_distance
 #' @method cosine_distance simple_triplet_matrix
 cosine_distance.simple_triplet_matrix <- function(x, ...){
  x <- t(x)
  stats::as.dist(1 - slam::crossprod_simple_triplet_matrix(x)/(sqrt(slam::col_sums(x^2) %*% t(slam::col_sums(x^2)))))
 }

 #' Optimized Computation of Jaccard Distance
 #'
 #' Utilizes the \pkg{slam} package to efficiently calculate jaccard distance
 #' on large sparse matrices.
 #'
 #' @param x A data type (e.g., \code{\link[tm]{DocumentTermMatrix}} or
 #' \code{\link[tm]{TermDocumentMatrix}}).
 #' @param \ldots ignored.
 #' @return Returns a jaccard distance object of class \code{"dist"}.
 #' @references \url{http://stackoverflow.com/a/36373333/1000343}
 #' \url{http://stats.stackexchange.com/a/89947/7482}
 #' @keywords jaccard dissimilarity
 #' @rdname jaccard_distance
 #' @export
 #' @author user41844 of StackOverflow, Dmitriy Selivanov, and Tyler Rinker <tyler.rinker@@gmail.com>.
 #' @examples
 #' library(gofastr)
 #' library(dplyr)
 #'
 #' out <- presidential_debates_2012 %>%
 #'     with(q_dtm(dialogue)) %>%
 #'     jaccard_distance()
 jaccard_distance <- function(x, ...){
  UseMethod("jaccard_distance")
 }


 #' @export
 #' @rdname jaccard_distance
 #' @method jaccard_distance DocumentTermMatrix
 jaccard_distance.DocumentTermMatrix <- function(x, ...){
  mat <-sign(x)
  A <- slam::tcrossprod_simple_triplet_matrix(mat)
  im <- which(A > 0, arr.ind=TRUE)
  b <- slam::row_sums(mat)
  Aim <- A[im]

  stats::as.dist(1 - Matrix::sparseMatrix(
    i = im[,1],
    j = im[,2],
    x = Aim / (b[im[,1]] + b[im[,2]] - Aim),
    dims = dim(A)
  ))
 }









 # x <- c('The university at st. charles', 'SUNY Berkshire', 'The univ. at Lockhaven-Pennslyvania',
 #       'Buffalo State', 'University at Buffalo', 'CUNY University-Boulder', 'Harvard College',
 #       'The university at st. charles')


 # y <- c('university at saint charles', 'Berkshire State College', 'University at Lockhaven',
 #       'SUNY Buffalo State College', 'SUNY University at Buffalo', 'University at Boulder', 'Yale University')

 ## drop articles
 ## abbreviation replacement

 # install.packages('gofastr')


 # cutoff <- .40
 # remove <- c('the', 'a', 'an', 'at', 'of', 'in', '-')
 # distance <- 'cosine'
 # ngrams <- 1
 # substitution = data.frame(
 #   pattern = c('univ\\.', '\\bst\\.', '\\bmt\\.', '&'),
 #   replacement = c('university', 'saint', 'mount', ' and '),
 #   stringsAsFactors = FALSE
 # )




 # fuzzy_map(x, y)
	fuzzy_map <- function(x, y, distance = 'cosine', cutoff = .40,
	remove = c('the', 'a', 'an', 'at', 'of', 'in', '-', ' - CL', ' - OS'),
	substitution = data.frame(
	pattern = c('univ\\.', '\\bst\\.', '\\bmt\\.', '&'),
	replacement = c('university', 'saint', 'mount', ' and '),
	stringsAsFactors = FALSE
	),
	ngrams = 1, # choose 1 or 2 (2 is 2 words)
	downweight.words = c('suny', 'cuny', 'college', 'university', 'state'),
	# words that are less important - make smaller to make the words LESS important
	downweight = .25, ...
	){

	cutoff <- 1 - cutoff
	if (!is.null(remove)) remove <- tolower(remove)
	if (!is.null(downweight)) downweight.words <- tolower(downweight.words)
	pat <- substitution[['pattern']]
	rep <- substitution[['replacement']]


	datx <- dplyr::data_frame(
	x = x,
	xl = tolower(x)
	)
	daty <- dplyr::data_frame(
	y = y,
	yl = tolower(y)
	)

	xu <- unique(datx[['xl']])
	yu <- unique(daty[['yl']])

	## make a document term matrix of all x values (left)
	left <- dtm_it(xu, pattern = pat, replacement = rep, ngrams = ngrams)

	## make a one-row document term matrix for each y value (right)
	rights <- lapply(yu, function(y_val) {
	o <- dtm_it(y_val, pattern = pat, replacement = rep, ngrams = ngrams)
	rownames(o) <- 'right'
	o
	})

	## get the distance measure function
	dm <- distance_measure(distance)

	## loop throu the y (right) document term matrices
	## 1. combine witht he x document term matrix
	## 2. weight as tf-idf
	## 3. compute simialrity
	locs <- unlist(lapply(rights, function(r){

	dtm <- c(left, r)
	dtm <- tm::weightTfIdf(dtm)

	dww <- downweight.words[downweight.words %in% colnames(dtm)]
	if (!is.null(dww) && length(dww) > 0) {
	dtm <- cbind(
	dtm[, !colnames(dtm) %in% dww],
	dtm[,dww]*downweight
	)
	}



	distmat <- dm(dtm)

	vals <- unlist(as.matrix(distmat)[nrow(dtm), -nrow(dtm)])
	if(any(vals <= cutoff)){
	which.min(vals)
	} else {
	NA
	}

	}), use.names = FALSE)

	xu_misses <- !seq_len(length(xu)) %in% locs

	lower_key <- dplyr::data_frame(
	xl = c(xu[locs], xu[xu_misses]),
	yl = c(yu, rep(NA, sum(xu_misses, na.rm = TRUE)))
	)

	dplyr::select(dplyr::full_join(
	dplyr::left_join(datx, lower_key, by = 'xl'),
	daty, by = 'yl'
	), x, y)
	}

	## look at bi vs unigram
	dtm_it <- function(x2, pattern, replacement, ngrams = 1, ...){

	if (!is.null(remove)) {
	x2 <- textclean::replace_tokens(x2, remove)
	}
	if (!is.null(remove)) {
	x2 <- textclean::mgsub(x2,
	pattern = pattern,
	replacement = replacement,
	fixed = FALSE
	)
	}


	quanteda::as.DocumentTermMatrix(quanteda::dfm(quanteda::tokens_tolower(
	quanteda::tokens(
	x2,
	remove_punct = FALSE,
	remove_numbers = TRUE,
	remove_hyphens = TRUE,
	ngrams = ngrams
	)
	)))
	}

	distance_measure <- function(dist){
	switch(dist,
	'cosine' = cosine_distance,
	'jaccard' = jaccard_distance,
	stop('provide a valid `distance` type')
	)
	}

	#' Optimized Computation of Cosine Distance
	#'
	#' Utilizes the \pkg{slam} package to efficiently calculate cosine distance
	#' on large sparse matrices.
	#'
	#' @param x A data type (e.g., \code{\link[tm]{DocumentTermMatrix}} or
	#' \code{\link[tm]{TermDocumentMatrix}}).
	#' @param \ldots ignored.
	#' @return Returns a cosine distance object of class \code{"dist"}.
	#' @references \url{http://stackoverflow.com/a/29755756/1000343}
	#' @keywords cosine dissimilarity
	#' @rdname cosine_distance
	#' @export
	#' @author Michael Andrec and Tyler Rinker <tyler.rinker@@gmail.com>.
	#' @examples
	#' library(gofastr)
	#' library(dplyr)
	#'
	#' out <- presidential_debates_2012 %>%
	#' with(q_dtm(dialogue)) %>%
	#' cosine_distance()
	cosine_distance <- function(x, ...){
	UseMethod("cosine_distance")
	}


	#' @export
	#' @rdname cosine_distance
	#' @method cosine_distance DocumentTermMatrix
	cosine_distance.DocumentTermMatrix <- function(x, ...){
	cosine_distance(slam::as.simple_triplet_matrix(x))
	}

	#' @export
	#' @rdname cosine_distance
	#' @method cosine_distance simple_triplet_matrix
	cosine_distance.simple_triplet_matrix <- function(x, ...){
	x <- t(x)
	stats::as.dist(1 - slam::crossprod_simple_triplet_matrix(x)/(sqrt(slam::col_sums(x^2) %*% t(slam::col_sums(x^2)))))
	}

	#' Optimized Computation of Jaccard Distance
	#'
	#' Utilizes the \pkg{slam} package to efficiently calculate jaccard distance
	#' on large sparse matrices.
	#'
	#' @param x A data type (e.g., \code{\link[tm]{DocumentTermMatrix}} or
	#' \code{\link[tm]{TermDocumentMatrix}}).
	#' @param \ldots ignored.
	#' @return Returns a jaccard distance object of class \code{"dist"}.
	#' @references \url{http://stackoverflow.com/a/36373333/1000343}
	#' \url{http://stats.stackexchange.com/a/89947/7482}
	#' @keywords jaccard dissimilarity
	#' @rdname jaccard_distance
	#' @export
	#' @author user41844 of StackOverflow, Dmitriy Selivanov, and Tyler Rinker <tyler.rinker@@gmail.com>.
	#' @examples
	#' library(gofastr)
	#' library(dplyr)
	#'
	#' out <- presidential_debates_2012 %>%
	#' with(q_dtm(dialogue)) %>%
	#' jaccard_distance()
	jaccard_distance <- function(x, ...){
	UseMethod("jaccard_distance")
	}


	#' @export
	#' @rdname jaccard_distance
	#' @method jaccard_distance DocumentTermMatrix
	jaccard_distance.DocumentTermMatrix <- function(x, ...){
	mat <-sign(x)
	A <- slam::tcrossprod_simple_triplet_matrix(mat)
	im <- which(A > 0, arr.ind=TRUE)
	b <- slam::row_sums(mat)
	Aim <- A[im]

	stats::as.dist(1 - Matrix::sparseMatrix(
	i = im[,1],
	j = im[,2],
	x = Aim / (b[im[,1]] + b[im[,2]] - Aim),
	dims = dim(A)
	))
	}









	# x <- c('The university at st. charles', 'SUNY Berkshire', 'The univ. at Lockhaven-Pennslyvania',
	# 'Buffalo State', 'University at Buffalo', 'CUNY University-Boulder', 'Harvard College',
	# 'The university at st. charles')


	# y <- c('university at saint charles', 'Berkshire State College', 'University at Lockhaven',
	# 'SUNY Buffalo State College', 'SUNY University at Buffalo', 'University at Boulder', 'Yale University')

	## drop articles
	## abbreviation replacement

	# install.packages('gofastr')


	# cutoff <- .40
	# remove <- c('the', 'a', 'an', 'at', 'of', 'in', '-')
	# distance <- 'cosine'
	# ngrams <- 1
	# substitution = data.frame(
	# pattern = c('univ\\.', '\\bst\\.', '\\bmt\\.', '&'),
	# replacement = c('university', 'saint', 'mount', ' and '),
	# stringsAsFactors = FALSE
	# )




	# fuzzy_map(x, y)