Finding the most similar or least similar long strings
library(tidyverse)
library(text2vec)
library(stopwords)
library(jsonlite)
library(listviewer)
raw <- read_csv(rstudioapi::selectFile())
preprocess <- function(x, special_keywords) {
  x |>
    # remove timestamp
    str_remove_all("\\n?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}") |>
    # remove non-alphanumerics
    str_remove_all("[^[:alnum:]]") |>
    # remove special words
    str_remove_all(paste0(special_keywords, collapse = "|"))
}
# customize for the specific use case: phrases to strip
# ("以下是语音转文本数据" = "the following is speech-to-text data"; "核桃编程" is a product name)
keywords <- c("以下是语音转文本数据", "核桃编程")
tidy_data <- raw |> mutate(clean_content = preprocess(content, keywords))
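# illustrative check of the cleaning step on a made-up string (not from the dataset);
# timestamps, punctuation, and the special keywords should all be stripped, leaving
# roughly "helloworld" (assuming an ICU locale where CJK characters count as alphanumeric)
preprocess("2024-01-01 10:00:00 以下是语音转文本数据: hello, world!", keywords)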
# convert to vector space; also discard very frequent terms
tokens <- word_tokenizer(tidy_data$clean_content)
iters <- itoken(tokens, ids = tidy_data$sample_id, progressbar = FALSE)
vocabs <- iters |>
  create_vocabulary(stopwords = stopwords(language = "zh", source = "misc")) |>
  prune_vocabulary(term_count_max = length(tidy_data$content) / 2)
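# optional: peek at what survived pruning (a small sketch; `vocabs` behaves like a
# data.frame with term / term_count / doc_count columns)
head(vocabs[order(-vocabs$term_count), ])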
# create the dtm and compute pairwise cosine similarity
# `sim` should be an n_docs x n_docs matrix
dtm <- create_dtm(iters, vocab_vectorizer(vocabs), type = "dgTMatrix")
sim <- sim2(dtm, dtm, method = "cosine", norm = "l2")
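# optional: inspect the similarity matrix before extracting pairs (a quick sketch;
# assumes `sim` comes back as a sparse dgCMatrix, which is what the helper below expects)
dim(sim)      # n_docs x n_docs
range(sim@x)  # range of the stored cosine similarities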
# a helper to extract ids for the n lowest / highest similarities from the similarity matrix
find_unique_values <- function(sparse_matrix, n, mode) {
  values       <- sparse_matrix@x              # stored (non-zero) similarities
  row_indices  <- sparse_matrix@i              # 0-indexed row positions
  col_pointers <- sparse_matrix@p              # CSC column pointers
  row_names    <- sparse_matrix@Dimnames[[1]]
  col_names    <- sparse_matrix@Dimnames[[2]]
  # Filter out values equal to 1 (the diagonal self-similarities; simplifies both modes)
  valid_indices        <- which(values != 1)
  filtered_values      <- values[valid_indices]
  filtered_row_indices <- row_indices[valid_indices]
  # Pick the n entries to report, based on mode
  if (mode == "lowest") {
    indices <- order(filtered_values)[1:n]
  } else if (mode == "highest") {
    indices <- order(-filtered_values)[1:n]
  } else {
    stop("mode must be 'lowest' or 'highest'")
  }
  ordered_values <- filtered_values[indices]
  # Map the selected entries back to row and column names
  rows <- row_names[filtered_row_indices[indices] + 1]  # +1 because `i` is 0-indexed
  original_positions <- valid_indices[indices]           # 1-based positions in `x`
  cols <- sapply(original_positions, function(k) {
    which(col_pointers >= k)[1] - 1                      # column whose pointer range covers k
  })
  cols <- col_names[cols]
  # Combine into a data frame
  result <- data.frame(
    Row    = rows,
    Column = cols,
    Value  = ordered_values
  )
  # Identify unique ids across rows and columns
  unique_elements <- unique(c(result$Row, result$Column))
  # Keep at most n unique ids
  if (length(unique_elements) > n) {
    unique_elements <- unique_elements[1:n]
  }
  result |>
    filter(Row %in% unique_elements | Column %in% unique_elements)
}
# least similar
res1 <- find_unique_values(sim, n = 3, mode = "lowest")
ids1 <- unique(c(res1$Row, res1$Column))
# most similar
res2 <- find_unique_values(sim, n = 3, mode = "highest")
ids2 <- unique(c(res2$Row, res2$Column))
# preview results against the original dataset
raw |>
  # filter(sample_id %in% ids1) |>
  filter(sample_id %in% ids2) |>
  jsonlite::toJSON() |>
  listviewer::jsonedit()
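# self-contained check of the helper on a tiny hand-built similarity matrix
# (a sketch with made-up ids "a", "b", "c"; values chosen only for illustration)
toy <- Matrix::sparseMatrix(
  i = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
  j = c(1, 1, 1, 2, 2, 2, 3, 3, 3),
  x = c(1, 0.9, 0.1, 0.9, 1, 0.2, 0.1, 0.2, 1),
  dims = c(3, 3),
  dimnames = list(c("a", "b", "c"), c("a", "b", "c"))
)
find_unique_values(toy, n = 2, mode = "highest")  # should surface the (a, b) pair
find_unique_values(toy, n = 2, mode = "lowest")   # should surface the (a, c) pair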