Skip to content

Instantly share code, notes, and snippets.

@tmasjc
Last active December 13, 2024 15:46
Show Gist options
  • Save tmasjc/4e743e3feffcc86480fd8e14a2d7d231 to your computer and use it in GitHub Desktop.
Save tmasjc/4e743e3feffcc86480fd8e14a2d7d231 to your computer and use it in GitHub Desktop.
寻找最相似或最不相似的长字符串
library(tidyverse)
library(text2vec)
library(stopwords)
library(jsonlite)
library(listviewer)
# Ask the user to pick the input CSV through the RStudio file dialog and load
# it. Later steps of this script read the `content` and `sample_id` columns.
raw <- rstudioapi::selectFile() |>
  read_csv()
# Clean raw text before tokenisation.
#
# Steps, applied in this order to every element of `x`:
#   1. drop timestamps shaped like "YYYY-MM-DD HH:MM:SS" (plus a newline
#      directly in front of them, if any);
#   2. drop every character that is not alphanumeric (this also removes
#      whitespace and punctuation);
#   3. drop every occurrence of the special keywords.
#
# Arguments:
#   x                 character vector of raw text.
#   special_keywords  character vector of phrases to strip. They are joined
#                     with "|" and used as a regular expression, so regex
#                     metacharacters keep their special meaning. Because
#                     step 2 runs first, a keyword containing punctuation or
#                     spaces can no longer match. Defaults to no keywords.
#
# Returns a character vector of the same length as `x`.
preprocess <- function(x, special_keywords = character()) {
  cleaned <- x |>
    # remove timestamp
    str_remove_all("\\n?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}") |>
    # remove non-alphanumerics
    str_remove_all("[^[:alnum:]]")

  # NA would be pasted as the literal text "NA", and an empty pattern is not
  # accepted by stringr, so both are discarded before building the regex.
  usable <- special_keywords[
    !is.na(special_keywords) & nzchar(special_keywords)
  ]
  if (length(usable) == 0L) {
    return(cleaned)
  }

  # remove special words
  str_remove_all(cleaned, paste0(usable, collapse = "|"))
}
# Boilerplate phrases to strip from every record; adapt to the use case
keywords <- c("以下是语音转文本数据", "核桃编程")
tidy_data <- mutate(raw, clean_content = preprocess(content, keywords))

# Tokenise the cleaned text and wrap the tokens in an iterator keyed by
# sample id
tokens <- word_tokenizer(tidy_data$clean_content)
iters <- itoken(tokens, ids = tidy_data$sample_id, progressbar = FALSE)

# Build the vocabulary without the Chinese stopword list, then prune terms
# whose count exceeds half the number of records (i.e. very frequent terms)
vocabs <- prune_vocabulary(
  create_vocabulary(
    iters,
    stopwords = stopwords(language = "zh", source = "misc")
  ),
  term_count_max = length(tidy_data$content) / 2
)

# Document-term matrix, then pairwise cosine similarity between records;
# the result is expected to be square (one row and one column per record)
dtm <- create_dtm(iters, vocab_vectorizer(vocabs), type = "dgTMatrix")
sim <- sim2(x = dtm, y = dtm, method = "cosine", norm = "l2")
# a helper to extract ids from similarities matrix
#
# Picks the `n` smallest or largest stored entries of a column-compressed
# sparse matrix and reports the row and column name of each of them.
#
# Arguments:
#   sparse_matrix  object exposing the compressed-column slots `@x` (stored
#                  values), `@i` (0-based row index of each value), `@p`
#                  (0-based column pointers) and `@Dimnames`.
#   n              maximum number of entries to return.
#   mode           "lowest" or "highest".
#
# Returns a data.frame with columns `Row`, `Column` and `Value`, ordered from
# the most extreme value onwards. It has fewer than `n` rows when fewer
# entries qualify. If a dimension has no names, 1-based positions are
# reported instead.
#
# Notes:
#   * Only stored entries are candidates; implicit zeros of the sparse matrix
#     are never returned, so "lowest" means the lowest stored value.
#   * Entries numerically equal to 1 are skipped in both modes.
#   * Entries are not de-duplicated: in a symmetric matrix each pair shows up
#     twice, once as (a, b) and once as (b, a).
find_unique_values <- function(sparse_matrix, n, mode) {
  mode <- match.arg(mode, c("lowest", "highest"))

  values <- sparse_matrix@x
  row_indices <- sparse_matrix@i
  col_pointers <- sparse_matrix@p
  row_names <- sparse_matrix@Dimnames[[1]]
  col_names <- sparse_matrix@Dimnames[[2]]

  # Filter out values equal to 1 (for both modes to simplify). A tolerance is
  # used because computed similarities may differ from 1 by rounding error.
  valid_positions <- which(abs(values - 1) > sqrt(.Machine$double.eps))
  candidate_values <- values[valid_positions]

  # Rank the candidates; head() never pads with NA when fewer than n exist
  ranking <- if (mode == "lowest") {
    order(candidate_values)
  } else {
    order(-candidate_values)
  }
  ranking <- head(ranking, n)
  ordered_values <- candidate_values[ranking]

  # Translate ranks within the filtered vector back to 1-based positions in
  # the slots of the original matrix before looking up rows and columns
  positions <- valid_positions[ranking]
  row_pos <- row_indices[positions] + 1L # +1 because `i` is 0-indexed
  # Column j owns the 0-based positions p[j] .. p[j + 1] - 1
  col_pos <- findInterval(positions - 1L, col_pointers)

  rows <- if (is.null(row_names)) row_pos else row_names[row_pos]
  cols <- if (is.null(col_names)) col_pos else col_names[col_pos]

  # Combine into a data frame
  result <- data.frame(
    Row = rows,
    Column = cols,
    Value = ordered_values
  )

  # Keep entries that touch one of the first n distinct ids
  keep_ids <- head(unique(c(result$Row, result$Column)), n)
  result[result$Row %in% keep_ids | result$Column %in% keep_ids, , drop = FALSE]
}
# Least similar pairs and the ids involved in them
res1 <- find_unique_values(sim, n = 3, mode = "lowest")
ids1 <- union(res1$Row, res1$Column)

# Most similar pairs and the ids involved in them
res2 <- find_unique_values(sim, n = 3, mode = "highest")
ids2 <- union(res2$Row, res2$Column)

# Preview the matching records of the original dataset in an interactive
# JSON viewer; swap `ids2` for `ids1` to inspect the least similar records
listviewer::jsonedit(
  jsonlite::toJSON(
    filter(raw, sample_id %in% ids2)
  )
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment