@trinker
Created January 10, 2019 13:54
Fuzzy Mapping Between Multi-Word Strings. Creates a map between 2 vectors with strings that are close but not identically named entities.
fuzzy_map <- function(x, y, distance = 'cosine', cutoff = .40,
    remove = c('the', 'a', 'an', 'at', 'of', 'in', '-', ' - CL', ' - OS'),
    substitution = data.frame(
        pattern = c('univ\\.', '\\bst\\.', '\\bmt\\.', '&'),
        replacement = c('university', 'saint', 'mount', ' and '),
        stringsAsFactors = FALSE
    ),
    ngrams = 1, # choose 1 or 2 (2 means two-word units, i.e., bigrams)
    downweight.words = c('suny', 'cuny', 'college', 'university', 'state'),
    # words that are less important - make `downweight` smaller to make these words LESS important
    downweight = .25, ...
){

    ## `cutoff` is supplied as a minimum similarity; convert it to a maximum distance
    cutoff <- 1 - cutoff

    if (!is.null(remove)) remove <- tolower(remove)
    if (!is.null(downweight.words)) downweight.words <- tolower(downweight.words)

    pat <- substitution[['pattern']]
    repl <- substitution[['replacement']]

    datx <- dplyr::tibble(
        x = x,
        xl = tolower(x)
    )

    daty <- dplyr::tibble(
        y = y,
        yl = tolower(y)
    )

    xu <- unique(datx[['xl']])
    yu <- unique(daty[['yl']])

    ## make a document term matrix of all x values (left)
    left <- dtm_it(xu, remove = remove, pattern = pat, replacement = repl, ngrams = ngrams)

    ## make a one-row document term matrix for each y value (right)
    rights <- lapply(yu, function(y_val) {
        o <- dtm_it(y_val, remove = remove, pattern = pat, replacement = repl, ngrams = ngrams)
        rownames(o) <- 'right'
        o
    })

    ## get the distance measure function
    dm <- distance_measure(distance)

    ## loop through the y (right) document term matrices
    ## 1. combine with the x document term matrix
    ## 2. weight as tf-idf
    ## 3. compute similarity
    locs <- unlist(lapply(rights, function(r){

        dtm <- c(left, r)
        dtm <- tm::weightTfIdf(dtm)

        dww <- downweight.words[downweight.words %in% colnames(dtm)]

        if (length(dww) > 0) {
            dtm <- cbind(
                dtm[, !colnames(dtm) %in% dww],
                dtm[, dww] * downweight
            )
        }

        distmat <- dm(dtm)
        vals <- unlist(as.matrix(distmat)[nrow(dtm), -nrow(dtm)])

        if (any(vals <= cutoff)){
            which.min(vals)
        } else {
            NA
        }

    }), use.names = FALSE)

    xu_misses <- !seq_along(xu) %in% locs

    lower_key <- dplyr::tibble(
        xl = c(xu[locs], xu[xu_misses]),
        yl = c(yu, rep(NA, sum(xu_misses, na.rm = TRUE)))
    )

    dplyr::select(dplyr::full_join(
        dplyr::left_join(datx, lower_key, by = 'xl'),
        daty, by = 'yl'
    ), x, y)
}
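
## A minimal tuning sketch (assumes the example `x`/`y` vectors from the
## commented demo at the bottom of this file). `cutoff` is a minimum
## similarity (converted internally to a maximum distance), so raising it
## makes matching stricter; `downweight` shrinks the tf-idf weight of the
## boilerplate words listed in `downweight.words`.
# fuzzy_map(x, y, cutoff = .60)   # stricter: require more similar names
# fuzzy_map(x, y, ngrams = 2)     # compare on two-word units instead of single words
# fuzzy_map(x, y, downweight.words = c('university', 'college'), downweight = .1)
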
## look at bi vs unigram (see the commented demo after this function)
dtm_it <- function(x2, remove = NULL, pattern = NULL, replacement = NULL, ngrams = 1, ...){

    ## drop low-information tokens (articles, etc.)
    if (!is.null(remove)) {
        x2 <- textclean::replace_tokens(x2, remove)
    }

    ## normalize abbreviations (e.g., 'univ.' -> 'university')
    if (!is.null(pattern)) {
        x2 <- textclean::mgsub(x2,
            pattern = pattern,
            replacement = replacement,
            fixed = FALSE
        )
    }

    quanteda::as.DocumentTermMatrix(quanteda::dfm(quanteda::tokens_tolower(
        quanteda::tokens(
            x2,
            remove_punct = FALSE,
            remove_numbers = TRUE,
            remove_hyphens = TRUE,
            ngrams = ngrams
        )
    )))
}
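
## Commented demo of the bi vs unigram choice referenced above (a sketch only;
## the exact term columns depend on the installed quanteda version's tokenizer):
# dtm_it('University at Buffalo', remove = c('at'), pattern = 'univ\\.',
#     replacement = 'university', ngrams = 1)  # unigram terms: 'university', 'buffalo'
# dtm_it('University at Buffalo', remove = c('at'), pattern = 'univ\\.',
#     replacement = 'university', ngrams = 2)  # bigram terms such as 'university_buffalo'
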
distance_measure <- function(dist){
    switch(dist,
        'cosine' = cosine_distance,
        'jaccard' = jaccard_distance,
        stop('provide a valid `distance` type')
    )
}
#' Optimized Computation of Cosine Distance
#'
#' Utilizes the \pkg{slam} package to efficiently calculate cosine distance
#' on large sparse matrices.
#'
#' @param x A document-term matrix object (e.g., \code{\link[tm]{DocumentTermMatrix}} or
#' \code{\link[tm]{TermDocumentMatrix}}).
#' @param \ldots ignored.
#' @return Returns a cosine distance object of class \code{"dist"}.
#' @references \url{http://stackoverflow.com/a/29755756/1000343}
#' @keywords cosine dissimilarity
#' @rdname cosine_distance
#' @export
#' @author Michael Andrec and Tyler Rinker <tyler.rinker@@gmail.com>.
#' @examples
#' library(gofastr)
#' library(dplyr)
#'
#' out <- presidential_debates_2012 %>%
#'     with(q_dtm(dialogue)) %>%
#'     cosine_distance()
cosine_distance <- function(x, ...){
    UseMethod("cosine_distance")
}

#' @export
#' @rdname cosine_distance
#' @method cosine_distance DocumentTermMatrix
cosine_distance.DocumentTermMatrix <- function(x, ...){
    cosine_distance(slam::as.simple_triplet_matrix(x))
}

#' @export
#' @rdname cosine_distance
#' @method cosine_distance simple_triplet_matrix
cosine_distance.simple_triplet_matrix <- function(x, ...){
    x <- t(x)
    stats::as.dist(
        1 - slam::crossprod_simple_triplet_matrix(x) /
            (sqrt(slam::col_sums(x^2) %*% t(slam::col_sums(x^2))))
    )
}
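
## Sanity-check sketch (an assumed dense equivalent, not part of the original
## API): for a small plain matrix the same row-wise cosine distance can be
## computed with base R, which is handy for spot-checking the sparse slam
## version above.
# m <- matrix(c(1, 0, 2,
#               0, 1, 2,
#               1, 1, 0), nrow = 3, byrow = TRUE)
# dense_cosine_dist <- stats::as.dist(1 - tcrossprod(m) / sqrt(rowSums(m^2) %o% rowSums(m^2)))
# all.equal(
#     c(dense_cosine_dist),
#     c(cosine_distance(slam::as.simple_triplet_matrix(m)))
# )
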
#' Optimized Computation of Jaccard Distance
#'
#' Utilizes the \pkg{slam} package to efficiently calculate jaccard distance
#' on large sparse matrices.
#'
#' @param x A document-term matrix object (e.g., \code{\link[tm]{DocumentTermMatrix}} or
#' \code{\link[tm]{TermDocumentMatrix}}).
#' @param \ldots ignored.
#' @return Returns a jaccard distance object of class \code{"dist"}.
#' @references \url{http://stackoverflow.com/a/36373333/1000343}
#' \url{http://stats.stackexchange.com/a/89947/7482}
#' @keywords jaccard dissimilarity
#' @rdname jaccard_distance
#' @export
#' @author user41844 of StackOverflow, Dmitriy Selivanov, and Tyler Rinker <tyler.rinker@@gmail.com>.
#' @examples
#' library(gofastr)
#' library(dplyr)
#'
#' out <- presidential_debates_2012 %>%
#'     with(q_dtm(dialogue)) %>%
#'     jaccard_distance()
jaccard_distance <- function(x, ...){
    UseMethod("jaccard_distance")
}

#' @export
#' @rdname jaccard_distance
#' @method jaccard_distance DocumentTermMatrix
jaccard_distance.DocumentTermMatrix <- function(x, ...){
    mat <- sign(x)
    A <- slam::tcrossprod_simple_triplet_matrix(mat)
    im <- which(A > 0, arr.ind = TRUE)
    b <- slam::row_sums(mat)
    Aim <- A[im]
    stats::as.dist(1 - Matrix::sparseMatrix(
        i = im[, 1],
        j = im[, 2],
        x = Aim / (b[im[, 1]] + b[im[, 2]] - Aim),
        dims = dim(A)
    ))
}
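
## Sanity-check sketch (a small assumed 0/1 example, not part of the original
## code): Jaccard distance between two binary rows is
## 1 - |intersection| / |union|, which can be spelled out in base R and
## compared against the sparse slam/Matrix version above.
# b1 <- c(1, 1, 0, 1)
# b2 <- c(1, 0, 0, 1)
# 1 - sum(b1 & b2) / sum(b1 | b2)   # 1 - 2/3 = 1/3
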
# x <- c('The university at st. charles', 'SUNY Berkshire', 'The univ. at Lockhaven-Pennslyvania',
#     'Buffalo State', 'University at Buffalo', 'CUNY University-Boulder', 'Harvard College',
#     'The university at st. charles')
# y <- c('university at saint charles', 'Berkshire State College', 'University at Lockhaven',
#     'SUNY Buffalo State College', 'SUNY University at Buffalo', 'University at Boulder', 'Yale University')
#
# install.packages('gofastr')
#
# cutoff <- .40
# remove <- c('the', 'a', 'an', 'at', 'of', 'in', '-')   ## drop articles
# distance <- 'cosine'
# ngrams <- 1
# substitution <- data.frame(                            ## abbreviation replacement
#     pattern = c('univ\\.', '\\bst\\.', '\\bmt\\.', '&'),
#     replacement = c('university', 'saint', 'mount', ' and '),
#     stringsAsFactors = FALSE
# )
#
# fuzzy_map(x, y)
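# ## A sketch of what to expect (assuming the `x`/`y` above): the result is a
# ## two-column data frame pairing each `x` with its best `y` match; any `x`
# ## with no `y` within the similarity cutoff gets an NA in the `y` column.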