Created
January 10, 2019 13:54
-
-
Save trinker/ca42a4b46e6bb067ba9bf010a9c514c7 to your computer and use it in GitHub Desktop.
Fuzzy Mapping Between Multi-Word Strings. GCreates a map between 2 vectors with strings that are close but not identically named entities.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
fuzzy_map <- function(x, y, distance = 'cosine', cutoff = .40, | |
remove = c('the', 'a', 'an', 'at', 'of', 'in', '-', ' - CL', ' - OS'), | |
substitution = data.frame( | |
pattern = c('univ\\.', '\\bst\\.', '\\bmt\\.', '&'), | |
replacement = c('university', 'saint', 'mount', ' and '), | |
stringsAsFactors = FALSE | |
), | |
ngrams = 1, # choose 1 or 2 (2 is 2 words) | |
downweight.words = c('suny', 'cuny', 'college', 'university', 'state'), | |
# words that are less important - make smaller to make the words LESS important | |
downweight = .25, ... | |
){ | |
cutoff <- 1 - cutoff | |
if (!is.null(remove)) remove <- tolower(remove) | |
if (!is.null(downweight)) downweight.words <- tolower(downweight.words) | |
pat <- substitution[['pattern']] | |
rep <- substitution[['replacement']] | |
datx <- dplyr::data_frame( | |
x = x, | |
xl = tolower(x) | |
) | |
daty <- dplyr::data_frame( | |
y = y, | |
yl = tolower(y) | |
) | |
xu <- unique(datx[['xl']]) | |
yu <- unique(daty[['yl']]) | |
## make a document term matrix of all x values (left) | |
left <- dtm_it(xu, pattern = pat, replacement = rep, ngrams = ngrams) | |
## make a one-row document term matrix for each y value (right) | |
rights <- lapply(yu, function(y_val) { | |
o <- dtm_it(y_val, pattern = pat, replacement = rep, ngrams = ngrams) | |
rownames(o) <- 'right' | |
o | |
}) | |
## get the distance measure function | |
dm <- distance_measure(distance) | |
## loop throu the y (right) document term matrices | |
## 1. combine witht he x document term matrix | |
## 2. weight as tf-idf | |
## 3. compute simialrity | |
locs <- unlist(lapply(rights, function(r){ | |
dtm <- c(left, r) | |
dtm <- tm::weightTfIdf(dtm) | |
dww <- downweight.words[downweight.words %in% colnames(dtm)] | |
if (!is.null(dww) && length(dww) > 0) { | |
dtm <- cbind( | |
dtm[, !colnames(dtm) %in% dww], | |
dtm[,dww]*downweight | |
) | |
} | |
distmat <- dm(dtm) | |
vals <- unlist(as.matrix(distmat)[nrow(dtm), -nrow(dtm)]) | |
if(any(vals <= cutoff)){ | |
which.min(vals) | |
} else { | |
NA | |
} | |
}), use.names = FALSE) | |
xu_misses <- !seq_len(length(xu)) %in% locs | |
lower_key <- dplyr::data_frame( | |
xl = c(xu[locs], xu[xu_misses]), | |
yl = c(yu, rep(NA, sum(xu_misses, na.rm = TRUE))) | |
) | |
dplyr::select(dplyr::full_join( | |
dplyr::left_join(datx, lower_key, by = 'xl'), | |
daty, by = 'yl' | |
), x, y) | |
} | |
## look at bi vs unigram | |
dtm_it <- function(x2, pattern, replacement, ngrams = 1, ...){ | |
if (!is.null(remove)) { | |
x2 <- textclean::replace_tokens(x2, remove) | |
} | |
if (!is.null(remove)) { | |
x2 <- textclean::mgsub(x2, | |
pattern = pattern, | |
replacement = replacement, | |
fixed = FALSE | |
) | |
} | |
quanteda::as.DocumentTermMatrix(quanteda::dfm(quanteda::tokens_tolower( | |
quanteda::tokens( | |
x2, | |
remove_punct = FALSE, | |
remove_numbers = TRUE, | |
remove_hyphens = TRUE, | |
ngrams = ngrams | |
) | |
))) | |
} | |
distance_measure <- function(dist){ | |
switch(dist, | |
'cosine' = cosine_distance, | |
'jaccard' = jaccard_distance, | |
stop('provide a valid `distance` type') | |
) | |
} | |
#' Optimized Computation of Cosine Distance | |
#' | |
#' Utilizes the \pkg{slam} package to efficiently calculate cosine distance | |
#' on large sparse matrices. | |
#' | |
#' @param x A data type (e.g., \code{\link[tm]{DocumentTermMatrix}} or | |
#' \code{\link[tm]{TermDocumentMatrix}}). | |
#' @param \ldots ignored. | |
#' @return Returns a cosine distance object of class \code{"dist"}. | |
#' @references \url{http://stackoverflow.com/a/29755756/1000343} | |
#' @keywords cosine dissimilarity | |
#' @rdname cosine_distance | |
#' @export | |
#' @author Michael Andrec and Tyler Rinker <tyler.rinker@@gmail.com>. | |
#' @examples | |
#' library(gofastr) | |
#' library(dplyr) | |
#' | |
#' out <- presidential_debates_2012 %>% | |
#' with(q_dtm(dialogue)) %>% | |
#' cosine_distance() | |
cosine_distance <- function(x, ...){ | |
UseMethod("cosine_distance") | |
} | |
#' @export | |
#' @rdname cosine_distance | |
#' @method cosine_distance DocumentTermMatrix | |
cosine_distance.DocumentTermMatrix <- function(x, ...){ | |
cosine_distance(slam::as.simple_triplet_matrix(x)) | |
} | |
#' @export | |
#' @rdname cosine_distance | |
#' @method cosine_distance simple_triplet_matrix | |
cosine_distance.simple_triplet_matrix <- function(x, ...){ | |
x <- t(x) | |
stats::as.dist(1 - slam::crossprod_simple_triplet_matrix(x)/(sqrt(slam::col_sums(x^2) %*% t(slam::col_sums(x^2))))) | |
} | |
#' Optimized Computation of Jaccard Distance | |
#' | |
#' Utilizes the \pkg{slam} package to efficiently calculate jaccard distance | |
#' on large sparse matrices. | |
#' | |
#' @param x A data type (e.g., \code{\link[tm]{DocumentTermMatrix}} or | |
#' \code{\link[tm]{TermDocumentMatrix}}). | |
#' @param \ldots ignored. | |
#' @return Returns a jaccard distance object of class \code{"dist"}. | |
#' @references \url{http://stackoverflow.com/a/36373333/1000343} | |
#' \url{http://stats.stackexchange.com/a/89947/7482} | |
#' @keywords jaccard dissimilarity | |
#' @rdname jaccard_distance | |
#' @export | |
#' @author user41844 of StackOverflow, Dmitriy Selivanov, and Tyler Rinker <tyler.rinker@@gmail.com>. | |
#' @examples | |
#' library(gofastr) | |
#' library(dplyr) | |
#' | |
#' out <- presidential_debates_2012 %>% | |
#' with(q_dtm(dialogue)) %>% | |
#' jaccard_distance() | |
jaccard_distance <- function(x, ...){ | |
UseMethod("jaccard_distance") | |
} | |
#' @export | |
#' @rdname jaccard_distance | |
#' @method jaccard_distance DocumentTermMatrix | |
jaccard_distance.DocumentTermMatrix <- function(x, ...){ | |
mat <-sign(x) | |
A <- slam::tcrossprod_simple_triplet_matrix(mat) | |
im <- which(A > 0, arr.ind=TRUE) | |
b <- slam::row_sums(mat) | |
Aim <- A[im] | |
stats::as.dist(1 - Matrix::sparseMatrix( | |
i = im[,1], | |
j = im[,2], | |
x = Aim / (b[im[,1]] + b[im[,2]] - Aim), | |
dims = dim(A) | |
)) | |
} | |
# x <- c('The university at st. charles', 'SUNY Berkshire', 'The univ. at Lockhaven-Pennslyvania', | |
# 'Buffalo State', 'University at Buffalo', 'CUNY University-Boulder', 'Harvard College', | |
# 'The university at st. charles') | |
# y <- c('university at saint charles', 'Berkshire State College', 'University at Lockhaven', | |
# 'SUNY Buffalo State College', 'SUNY University at Buffalo', 'University at Boulder', 'Yale University') | |
## drop articles | |
## abbreviation replacement | |
# install.packages('gofastr') | |
# cutoff <- .40 | |
# remove <- c('the', 'a', 'an', 'at', 'of', 'in', '-') | |
# distance <- 'cosine' | |
# ngrams <- 1 | |
# substitution = data.frame( | |
# pattern = c('univ\\.', '\\bst\\.', '\\bmt\\.', '&'), | |
# replacement = c('university', 'saint', 'mount', ' and '), | |
# stringsAsFactors = FALSE | |
# ) | |
# fuzzy_map(x, y) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment